Here's the part of the code that I want to run 4 times. Without any counters it works as intended: the link to the next page is retrieved and scraped for the relevant data:
def parse_commits_page(self, response):
    yield {
        'author': response.xpath('//a[@rel="author"]/text()').extract(),
        'name': response.xpath('//strong/a/text()').extract(),
        'last_commits': response.xpath('//relative-time/text()').extract()
    }
    next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
    yield response.follow(next_page, callback=self.parse_commits_page)
Here are the variants of the loop I tried:
Adding a simple global counter:
count = 0

def parse_commits_page(self, response):
    global count
    while (count < 4):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        count = count + 1
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page)
Adding a sub-function:
def parse_commits_page(self, response):
    def grabber(response):
        return {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }

    yield grabber(response)
    for i in range(3):
        yield response.follow(
            response.xpath('//a[@rel="nofollow"]/@href')[-1].extract(),
            callback=grabber
        )
With the counter, the response is updated only once (with the increment placed as in the code above), or not at all if count = count + 1 is moved to the end of the loop.
With the sub-function, the response is updated only on the last iteration, resulting in 2 scraped pages instead of 4.
What is the correct way to implement this loop so that the variables are updated as intended?
Here's the complete code, if that helps (right now I use 4 defs instead of a loop):
# -*- coding: utf-8 -*-
import scrapy
from random import randint
from time import sleep

BASE_URL = 'https://github.com'


class DiscoverSpider(scrapy.Spider):
    name = 'discover_commits_new'
    allowed_domains = ['github.com']
    start_urls = ['https://github.com/search?utf8=%E2%9C%93&q=stars%3E100&ref=simplesearch']

    def parse(self, response):
        # Select all the project urls on page
        project = BASE_URL + response.xpath('//h3/a[@class="v-align-middle"]/@href').extract_first()
        yield response.follow(project, self.parse_project)

        # Random wait, so GitHub doesn't ban me right away
        sleep(randint(5, 20))

        # Follow to the next page when every project on this one is scraped
        next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_page is not None:
            next_page = BASE_URL + next_page
            yield response.follow(next_page, callback=self.parse)

    # Parse the main page of the project
    def parse_project(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'tags': [x.strip() for x in response.css('.topic-tag::text').extract()],
            'lang_name': response.css('.lang::text').extract(),
            'lang_perc': response.css('.percent::text').extract(),
            'stars': response.css('.social-count::text').extract()[1].strip(),
            'forks': response.css('.social-count::text').extract()[2].strip(),
            'commits': response.css('.text-emphasized::text').extract()[0].strip(),
            'contributors': response.css('.text-emphasized::text').extract()[3].strip()
        }
        commits_page = BASE_URL + response.xpath('//*[@class="commits"]//@href').extract_first()
        yield response.follow(commits_page, self.parse_commits_page)

    # Get last commits
    def parse_commits_page(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page1)

    def parse_commits_page1(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page2)

    def parse_commits_page2(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(next_page, callback=self.parse_commits_page3)

    def parse_commits_page3(self, response):
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
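Each call to parse_commits_page handles exactly one response, so a while loop or global counter inside it can only re-yield data from that same response; the next page is only parsed when Scrapy invokes the callback again with the new response. The usual pattern is therefore to carry the counter along with each request, e.g. in request.meta, and stop scheduling the next page once four pages have been scraped. A minimal sketch of that idea (the meta key pages_scraped is my own name; the XPaths are taken from the question):

    # Get last commits: follow the "next" link until 4 pages have been scraped
    def parse_commits_page(self, response):
        pages_scraped = response.meta.get('pages_scraped', 0) + 1
        yield {
            'author': response.xpath('//a[@rel="author"]/text()').extract(),
            'name': response.xpath('//strong/a/text()').extract(),
            'last_commits': response.xpath('//relative-time/text()').extract()
        }
        if pages_scraped < 4:
            next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
            yield response.follow(next_page,
                                  callback=self.parse_commits_page,
                                  meta={'pages_scraped': pages_scraped})

With something like this in place, parse_commits_page1, parse_commits_page2 and parse_commits_page3 are no longer needed.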
Related
I am trying to scrape a website using Python and Scrapy, but I have issues with saving the result.
Error log I receive:
yield result = {
^
SyntaxError: invalid syntax
When I remove the "result = ", I don't get any error, but the reason I am doing that is to save the result in a variable, which I use in the last part of the code in "f.write(result)".
The code is below:
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "ufcspider"

    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield result = {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callable=self.parse)

        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.json'
        with open(filename, 'wb') as f:
            f.write(result)
        self.log(f'Saved file {filename}')
First define result, then yield it:
result = { ... }
yield result
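For completeness, a sketch of how the whole callback could look once result is defined before being yielded. The results list and the json serialization are my own additions (a dict cannot be written to a file directly), and note callback=, not callable=, when building the next-page Request:

import json

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "ufcspider"
    start_urls = ['http://quotes.toscrape.com/page/1/']

    def parse(self, response):
        results = []
        for quote in response.css('div.quote'):
            # Define the item first, then yield it
            result = {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
                'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }
            results.append(result)
            yield result

        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

        # Serialize the collected dicts before writing them to disk
        page = response.url.split("/")[-2]
        filename = f'quotes-{page}.json'
        with open(filename, 'w') as f:
            json.dump(results, f)
        self.log(f'Saved file {filename}')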
I'm crawling through some directories on an ASP.NET site via Scrapy.
The pages to crawl through are encoded like this:
javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')
where X is an int between 1 and 180. The problem is that the URL stays the same when I click the next page or any other page.
I've written some code below, but it can only extract the links on the first page.
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
from scrapy.http import FormRequest
import js2xml
import requests
from datetime import datetime


class nnggzySpider(scrapy.Spider):
    name = 'nnggzygov'
    start_urls = [
        'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
    ]
    base_url = 'https://www.nnggzy.org.cn'
    custom_settings = {
        'LOG_LEVEL': 'ERROR'
    }

    def parse(self, response):
        _response = response.text
        self.data = {}
        soup = BeautifulSoup(response.body, 'html.parser')
        tags = soup.find_all('a', href=re.compile(r"InfoDetail"))
        # Get the pagination parameters
        __VIEWSTATE = re.findall(r'id="__VIEWSTATE" value="(.*?)" />', _response)
        A = __VIEWSTATE[0]
        # print(A)
        __EVENTTARGET = 'MoreInfoListZbgs1$Pager'
        B = __EVENTTARGET
        __CSRFTOKEN = re.findall(r'id="__CSRFTOKEN" value="(.*?)" />', _response)
        C = __CSRFTOKEN
        page_num = re.findall(r'title="转到第(.*?)页"', _response)
        max_page = page_num[-1]
        content = {
            '__VIEWSTATE': A,
            '__EVENTTARGET': B,
            '__CSRFTOKEN': C,
            'page_num': max_page
        }
        infoid = re.findall(r'InfoID=(.*?)&CategoryNum', _response)
        print(infoid)
        yield scrapy.Request(url=response.url, callback=self.parse_detail, meta={"data": content})

    def parse_detail(self, response):
        max_page = response.meta['data']['page_num']
        for i in range(2, int(max_page)):
            data = {
                '__CSRFTOKEN': '{}'.format(response.meta['data']['__CSRFTOKEN']),
                '__VIEWSTATE': '{}'.format(response.meta['data']['__VIEWSTATE']),
                '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                '__EVENTARGUMENT': '{}'.format(i),
                # '__VIEWSTATEENCRYPTED': '',
                # 'txtKey': ''
            }
            yield scrapy.FormRequest(url=response.url, callback=self.parse, formdata=data, method="POST", dont_filter=True)
Can anyone help me with this?
It looks like pagination on the mentioned website is done by sending POST requests with formdata like:
{
    "__CSRFTOKEN": ...,
    "__VIEWSTATE": ...,
    "__EVENTTARGET": "MoreInfoListZbgs1$Pager",
    "__EVENTARGUMENT": page_number,
    "__VIEWSTATEENCRYPTED": "",
    "txtKey": ""
}
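A minimal sketch of a spider that issues such POST requests for every page, as an illustration of the formdata above. It is not tested against the site; the parse_page callback is a placeholder, and in practice __VIEWSTATE may need to be re-read from each new response rather than reused:

import scrapy


class PagerSketchSpider(scrapy.Spider):
    name = 'pager_sketch'
    start_urls = ['https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001']

    def parse(self, response):
        # Hidden ASP.NET fields that have to be echoed back with every postback
        csrf_token = response.css('input[name=__CSRFTOKEN]::attr(value)').get()
        viewstate = response.css('input[name=__VIEWSTATE]::attr(value)').get()

        for page_number in range(2, 181):
            yield scrapy.FormRequest(
                url=response.url,
                formdata={
                    '__CSRFTOKEN': csrf_token,
                    '__VIEWSTATE': viewstate,
                    '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                    '__EVENTARGUMENT': str(page_number),
                    '__VIEWSTATEENCRYPTED': '',
                    'txtKey': '',
                },
                callback=self.parse_page,
                dont_filter=True,  # the URL is identical for every page
            )

    def parse_page(self, response):
        # Placeholder: extract the listing rows of each page here
        pass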
I know this is a year-old thread, but I am posting the answer for future visitors from a Google search.
Your form submission didn't work because there must be more hidden fields inside the form, further down the page. In my case there are, and here is the working submission:
# This is the next page link
# <a id="nextId" href="javascript:__doPostBack('MoreInfoListZbgs1$Pager','')"> Next </a>

# This is how the website evaluates the next link
# <script type="text/javascript">
# //<![CDATA[
# var theForm = document.forms['Form1'];
# if (!theForm) {
#     theForm = document.Form1;
# }
# function __doPostBack(eventTarget, eventArgument) {
#     if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
#         theForm.__EVENTTARGET.value = eventTarget;
#         theForm.__EVENTARGUMENT.value = eventArgument;
#         theForm.submit();
#     }
# }
# //]]>
# </script>

# According to the above js code, we need to pass in the following arguments:
data = {
    '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',  # first argument from the javascript:__doPostBack('MoreInfoListZbgs1$Pager','') next link
    '__EVENTARGUMENT': '',  # second argument from the javascript:__doPostBack('MoreInfoListZbgs1$Pager','') next link; in my case it is empty
    '__VIEWSTATE': response.css('input[name=__VIEWSTATE]::attr("value")').get(),
    # These are the additional hidden input fields you need to pass in
    '__VIEWSTATEGENERATOR': response.css('input[name=__VIEWSTATEGENERATOR]::attr("value")').get(),
    '__EVENTVALIDATION': response.css('input[name=__EVENTVALIDATION]::attr("value")').get(),
}
yield scrapy.FormRequest(url=form_action_url_here, formdata=data, callback=self.parse)
I am trying to scrape a forum but cannot log in. I checked with Chrome dev tools and found that this site does not use any token, etc. However, the FormRequest does not work. Could someone possibly spot what I am doing wrong?
import scrapy
from dateutil import parser as dp  # assumed import: dp.parse() below suggests dateutil.parser


class WbcSpider(scrapy.Spider):
    '''
    yield_dict: mandatory keys
        db_handler
        meta_dict
        error
        result
    '''
    name = 'wbc'

    def __init__(self):
        super().__init__()
        self.start_ric_url_dict = {
            'wbc': 'https://hotcopper.com.au/asx/wbc/discussion/?post_view=0'
        }
        # flags' dict for determining whether to keep scraping
        self.comment_cont_dict = dict()
        self.post_cont_dict = dict()
        self.max_old_num = 5
        self.stop_date_flag = dp.parse('2018-12-31')
        self.scrapy_meta_keys = [
            'depth', 'download_timeout', 'download_slot', 'download_latency', '_id'
        ]

    def start_requests(self):
        yield scrapy.Request('https://hotcopper.com.au/login/', callback=self.login)

    def login(self, response):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                'login': '***',
                'password': '***',
                'remember': '1',
                'cookie_check': '1',
                'tos': '1',
                'redirect': 'https://hotcopper.com.au/',
                '_xfToken': '',
                '_xfResponseType': 'json',
                '_xfRequestUri': '/login/'
            },
            callback=self.after_login, dont_filter=True)
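One way to see whether the login actually worked is to inspect the response inside the callback. A rough sketch, assuming an after_login like the one referenced above; the "logout" success marker and the parse_discussion callback are assumptions, not taken from the question:

    def after_login(self, response):
        # Assumed success marker: a "logout" link usually only appears when authenticated
        if b'logout' not in response.body.lower():
            self.logger.error('Login seems to have failed (no logout link at %s)', response.url)
            return
        self.logger.info('Login appears to have succeeded')
        # Continue with the authenticated crawl, e.g.:
        for url in self.start_ric_url_dict.values():
            yield scrapy.Request(url, callback=self.parse_discussion)  # parse_discussion is hypothetical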
How can I skip one iteration of the spider if the webpage contains certain data?
Page titles:
We have several page titles across pages. I only show the titles here and omit the other data (dates, likes).
page 1 title: 'We like cats' # this title is valid
page 2 title: 'This title contains WORD X...' # this title is not valid (skip it)
page 3 title: 'Best ideas' # this title is valid
Code:
from scrapy.spiders import CrawlSpider


class Carflix(CrawlSpider):
    name = 'carflix'
    allowed_domains = ['sitex.com']
    start_urls = ['http://sitex.com/page-1.html',
                  'http://sitex.com/page-2.html',
                  'http://sitex.com/page-3.html']

    def parse(self, response):
        date = response.xpath('//div[@class="date"]/text()').extract_first()
        pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
        if 'WORD X' in pagetitle:
            # what do I need to do here to skip adding the data if the page title contains 'WORD X'?
        likes = response.xpath('//div[@class="likes"]/text()').extract_first()
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
The result should be:
[{
    'pagetitle': 'We like cats',
    'date': '01/01/2019',
    'likes': 200
 },
 {
    'pagetitle': 'Best ideas',
    'date': '02/01/2019',
    'likes': 100
 }]
Just yield your results under your specified condition:
def parse(self, response):
    date = response.xpath('//div[@class="date"]/text()').extract_first()
    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first()
    likes = response.xpath('//div[@class="likes"]/text()').extract_first()
    if 'WORD X' not in pagetitle:
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }
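If some pages have no title element at all, extract_first() returns None and the in check raises a TypeError. A slightly more defensive variant of the same condition (my own addition, not part of the original answer):

    pagetitle = response.xpath('//div[@class="title"]/text()').extract_first(default='')
    if 'WORD X' not in pagetitle:
        yield {
            'pagetitle': pagetitle,
            'date': date,
            'likes': likes,
        }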
I'm crawling the following page: http://graphics.stltoday.com/apps/payrolls/salaries/teachers/detail/25074/ and trying to grab each value from the table (salary, job title, years with the district, etc.). When I test this in the scrapy shell, all of the values are returned by response.xpath('//th[@scope="row"]/following-sibling::td[1]/text()').extract(). However, when I do the same thing in the crawler, only the first element (district) shows up. Any suggestions?
Crawler code (ideally, each element would go into its own variable for cleaner output):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class Spider2(CrawlSpider):
    # name of the spider
    name = 'stlteacher'

    # list of allowed domains
    allowed_domains = ['graphics.stltoday.com']

    # starting url for scraping
    start_urls = ['http://graphics.stltoday.com/apps/payrolls/salaries/teachers/']

    rules = [
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/[0-9]+/position/[0-9]+/$']),
            follow=True),
        Rule(LinkExtractor(
            allow=['/apps/payrolls/salaries/teachers/detail/[0-9]+/$']),
            callback='parse_item',
            follow=True),
    ]

    # setting the location of the output csv file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/stlteachers3.csv'
    }

    def parse_item(self, response):
        # Remove XML namespaces
        response.selector.remove_namespaces()

        # Extract article information
        url = response.url
        name = response.xpath('//p[@class="table__title"]/text()').extract()
        district = response.xpath('//th[@scope="row"]/following-sibling::td[1]/text()').extract()

        for item in zip(name, district):
            scraped_info = {
                'url': url,
                'name': item[0],
                'district': item[1],
            }
            yield scraped_info
Your zip is a bit confusing there: name has only one element, and zip stops at the shortest input, so only a single row ever gets emitted. If you want to scrape the whole table, you need to iterate over the table rows and pick up each row's name and value.
I got pretty good results with this piece of code:
def parse_item(self, response):
    name = response.xpath('//p[@class="table__title"]/text()').extract_first()
    item = {
        'name': name,
        'url': response.url
    }
    for row in response.xpath('//th[@scope="row"]'):
        row_name = row.xpath('text()').extract_first('').lower().strip(':')
        row_value = row.xpath('following-sibling::td[1]/text()').extract_first()
        item[row_name] = row_value
    yield item
This returns:
{
    'name': 'Bracht, Nathan',
    'url': 'http://graphics.stltoday.com/apps/payrolls/salaries/teachers/detail/25074/',
    'district': 'Affton 101',
    'school': 'Central Office',
    'position': 'Central Office Admin.',
    'degree earned': 'Doct',
    'salary': '$152,000.00',
    'extended contract pay': None,
    'extra duty pay': None,
    'total pay (all combined)': '$152,000.00',
    'years in district': '5',
    'years in mo schools': '19',
    'multiple position detail': None
}
}