Scrapy - crawl 4 levels of pages per item, cannot go depth-first - python

I want to crawl the township directory of China. The website is structured in 4 levels, which are province page, city page, county page, and township page. For example, on the province page, all the provinces are listed. If we click the link of one province, then it takes us to the city page and a list of the cities in that province is displayed.
I want each of my items to be a township. It includes town_name, town_id (gbcode), and the corresponding county_name, city_name, and prov_name. So the spider should collect information along the way as it goes deeper towards the township page. However, my current approach using a for loop does not seem to work. There is no problem with prov_name, but the city and county names are mostly incorrect: they are always the last city/county listed on their corresponding page. I think the problem is that the spider does not go depth-first; it only issues the parse_county requests at the end of the loop. But changing the depth priority in the settings does not solve the problem.
---------- Sample Result --------
town_name, year, gbcode, city, province, county
建国门街道办事处,2016,110101008000,市辖区,北京市,延庆区
东直门街道办事处,2016,110101009000,市辖区,北京市,延庆区
和平里街道办事处,2016,110101010000,市辖区,北京市,延庆区
前门街道办事处,2016,110101011000,市辖区,北京市,延庆区
崇文门外街道办事处,2016,110101012000,市辖区,北京市,延庆区
import scrapy
import re
from scrapy.spiders import Spider
from admincode.items import AdmincodeItem

class StatsSpider(Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        for item in self.parse_provincetr(response, response.selector.css(".provincetr")):
            yield item

    def get_text_href(self, td):
        if not td.xpath('a'):
            return td.xpath('text()').extract()[0], None
        else:
            return td.xpath('a/text()').extract()[0], td.xpath('a/@href').extract()[0]

    def parse_provincetr(self, response, trs):
        year_pattern = re.compile('(tjyqhdmhcxhfdm/)([0-9][0-9][0-9][0-9])')
        year = year_pattern.search(response.url).group(2)
        for td in trs.xpath('td'):
            scraped = {}
            scraped['year'] = year
            scraped['prov_name'], href = self.get_text_href(td)
            url = response.urljoin(href)
            yield scrapy.Request(url, callback=self.parse_citytr,
                                 meta={'scraped': scraped})

    def parse_2td(self, response, trs, var_name, nextparse):
        for tr in trs:
            scraped = response.meta['scraped']
            scraped[var_name], href = self.get_text_href(tr.xpath('td')[1])
            if nextparse:
                url = response.urljoin(href)
                yield scrapy.Request(url, callback=nextparse, meta={'scraped': scraped})
            else:
                item = AdmincodeItem()
                item['year'] = scraped['year']
                item['prov_name'] = scraped['prov_name']
                item['city_name'] = scraped['city_name']
                item['county_name'] = scraped['county_name']
                item['town_name'] = scraped['town_name']
                item['gbcode'], href = self.get_text_href(
                    tr.xpath('td')[0])
                yield item

    def parse_citytr(self, response):
        for city in self.parse_2td(response, response.selector.css(".citytr"), 'city_name', self.parse_countytr):
            yield city

    def parse_countytr(self, response):
        for county in self.parse_2td(response, response.selector.css(".countytr"), 'county_name', self.parse_towntr):
            yield county

    def parse_towntr(self, response):
        for town in self.parse_2td(response, response.selector.css(".towntr"), 'town_name', None):
            yield town

I think you have just made things a bit complex. This is a simple scraper; what you need to do is pass information from one page to another page using meta. Since meta is a dictionary held in memory, we need to make sure we create copies of the information for the items to come. For that we use copy.deepcopy. This makes sure the data is not overwritten before the items are yielded.
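To see why the copies matter, here is a minimal standalone illustration (made-up names, nothing Scrapy-specific): without a copy every request shares one dict, while with deepcopy each request keeps its own snapshot.

from copy import deepcopy

meta = {'province': 'Beijing'}
shared, copied = [], []
for city in ['Dongcheng', 'Xicheng', 'Yanqing']:
    meta['city_name'] = city          # mutates the one shared dict
    shared.append(meta)               # every entry is the same object
    copied.append(deepcopy(meta))     # each entry is an independent snapshot
print([m['city_name'] for m in shared])   # ['Yanqing', 'Yanqing', 'Yanqing']
print([m['city_name'] for m in copied])   # ['Dongcheng', 'Xicheng', 'Yanqing']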
Below is a scraper that does that:
from copy import deepcopy

from scrapy.spiders import Spider

class StatsSpider(Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{}/index.html'.format(year) for year in range(2009, 2010)]

    def parse(self, response):
        for item in response.css(".provincetr a"):
            name = item.xpath("./text()").extract_first().strip()
            link = item.xpath("./@href").extract_first().strip()
            yield response.follow(link, callback=self.parse_province, meta={'item': {'province': name}})

    def parse_province(self, response):
        meta = response.meta['item']
        for cityrow in response.css(".citytr"):
            city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
            city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
            city_code = cityrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['city_name'] = city_name
            meta_new['city_code'] = city_code
            yield response.follow(city_link, callback=self.parse_city, meta={'item': meta_new})

    def parse_city(self, response):
        meta = response.meta['item']
        for countyrow in response.css(".countytr"):
            county_link = countyrow.xpath("./td[2]/a/@href").extract_first()
            county_name = countyrow.xpath("./td[2]/a/text()").extract_first()
            county_code = countyrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['county_name'] = county_name
            meta_new['county_code'] = county_code
            yield response.follow(county_link, callback=self.parse_county, meta={"item": meta_new})

    def parse_county(self, response):
        meta = response.meta['item']
        for townrow in response.css(".towntr"):
            town_link = townrow.xpath("./td[2]/a/@href").extract_first()
            town_name = townrow.xpath("./td[2]/a/text()").extract_first()
            town_code = townrow.xpath("./td[1]/a/text()").extract_first()
            meta_new = deepcopy(meta)
            meta_new['town_name'] = town_name
            meta_new['town_code'] = town_code
            yield meta_new
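Side note: on Scrapy 1.7 and newer, cb_kwargs is usually a cleaner way than meta to pass data between callbacks, and building a fresh dict per request removes the need for deepcopy altogether. A rough sketch of the first two levels in that style (not tested against the site; later levels would follow the same pattern):

from scrapy.spiders import Spider

class StatsSpider(Spider):
    # name, allowed_domains and start_urls as above ...

    def parse(self, response):
        for a in response.css(".provincetr a"):
            province = a.xpath("./text()").extract_first().strip()
            # cb_kwargs entries become keyword arguments of the callback
            yield response.follow(a, callback=self.parse_province,
                                  cb_kwargs={'province': province})

    def parse_province(self, response, province):
        for cityrow in response.css(".citytr"):
            city_name = cityrow.xpath("./td[2]/a/text()").extract_first()
            city_link = cityrow.xpath("./td[2]/a/@href").extract_first()
            # a fresh dict is built per request, so nothing is shared or overwritten
            yield response.follow(city_link, callback=self.parse_city,
                                  cb_kwargs={'item': {'province': province,
                                                      'city_name': city_name}})
    # parse_city / parse_county would accept `item` the same way and extend it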

Related

Trying to add multiple yields into a single json file using Scrapy

I am trying to figure out if my scrapy tool is correctly hitting the product_link for the request callback - 'yield scrapy.Request(product_link, callback=self.parse_new_item)'.
product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A',
but I have not been able to confirm whether my program actually jumps into the next step so that I can retrieve the correct yield return. Thank you!
# Import the required libraries
import scrapy
# Import the Item class with fields
# mentioned in the items.py file
from ..items import AntairaItem

# Spider class name
class productJumper(scrapy.Spider):
    # Name of the spider
    name = 'productJumper'
    # The domain to be scraped
    allowed_domains = ['antaira.com']
    # The URLs to be scraped from the domain
    start_urls = ['https://www.antaira.com/products/10-100Mbps']
    #target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']

    # First Step: Find every div with the class 'product-container' and step into the links
    def parse(self, response):
        #product_link = response.urljoin(rel_product_link)
        # creating items dictionary
        items = AntairaItem()
        rel_product_link = response.css('div.center767')
        for url in rel_product_link:
            rel_product_link = response.xpath('//div[@class="product-container"]//a/@href').get(),
            product_link = response.urljoin('rel_product_link'),
            items['rel_product_link'] = rel_product_link,
            items['product_link'] = product_link
            #yield items
            # 2nd Step: Return a list of the all products-links that will be scrapped
            #yield {
            #    # take the first relative product link
            #    'rel_product_link' : rel_product_link,
            #    'product_link' : product_link,
            #}
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    # Final Step: Run through each product and Yield the results
    def parse_new_item(self, response):
        for product in response.css('main.products'):
            name = product.css(('h1.product-name::text').strip(' \t\n\r')).get()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
You have a couple of issues:
Scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
Your second parse callback references a variable items that it doesn't have access to, because it was defined in your first parse callback.
In your urljoin call you are using a string literal instead of a variable for rel_product_link.
In the example below I fixed those issues and made some additional notes:
import scrapy
from ..items import AntairaItem

class ProductJumper(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumper'
    allowed_domains = ['antaira.com']
    start_urls = ['https://www.antaira.com/products/10-100Mbps']

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = product.css('div.selectors img::attr(src)').get()
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
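To confirm that the requests really reach parse_new_item (the original concern), a log line inside the callback is usually enough, and the scrapy parse command can run a single URL through a chosen callback. A small sketch (the URL is the one from the question):

# Inside the spider, a log line makes every call visible in the crawl output:
def parse_new_item(self, response):
    self.logger.info("parse_new_item reached: %s", response.url)
    ...

# From the command line, `scrapy parse` can also run one URL through a
# specific callback (assuming the spider is named productJumper):
#   scrapy parse --spider=productJumper -c parse_new_item \
#       "https://www.antaira.com/products/10-100Mbps/LNX-500A"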

How Do I Set Up Pagination correctly?

I'm currently working on Scrapy code that will extract 3 types of data for each product; I call the data title, price, and upc. For each product, my program scrapes the title and price correctly, but I am having trouble scraping the upc, since the upc is on another page.
What I want my program to do, for each product, is to extract the title and price on the main page, then go into another page to extract the UPC code. Once it gets the UPC code, I want the program to move on to the next product on the main page and repeat the same method for the remaining products.
Here is my code.
import scrapy
from scrapy.utils.response import open_in_browser
from ..items import QuotetutorialItem

data = {hidden}
headers = {hidden}

class BrickseekSpider(scrapy.Spider):
    name = 'brickseek1'
    allowed_domains = ['brickseek.com']

    def start_requests(self):
        dont_filter = True
        yield scrapy.http.FormRequest(url='https://brickseek.com/login/', headers=headers, formdata=data,
                                      callback=self.parse)

    def parse(self, response):
        items = QuotetutorialItem()
        products = response.css('div.item-list__tile')
        for product in products:
            title = product.css('.item-list__title span::text').extract()
            price = product.css('.item-list__price-column--highlighted .price-formatted__dollars::text').extract()
            #another_page = response.css('div.item-list__tile a::attr(href)').get()
            #if another_page:
            #    upc = product.css('div.item-overview__meta-item::text').extract()[6]
            #    yield response.follow(another_page, callback=self.parse)
            items['title'] = title
            items['price'] = price
            #items['upc'] = upc
            yield items
All you need to do is put your item (after filling in title and price) into meta when you visit the next page (assuming your css selectors are correct):
def parse(self, response):
    products = response.css('div.item-list__tile')
    for product in products:
        item = QuotetutorialItem()
        item['title'] = product.css('.item-list__title span::text').extract()
        item['price'] = product.css('.item-list__price-column--highlighted .price-formatted__dollars::text').extract()
        another_page = response.css('div.item-list__tile a::attr(href)').get()
        if another_page:
            yield response.follow(another_page, callback=self.parse_upc, meta={'item': item})
        else:
            yield item

def parse_upc(self, response):
    item = response.meta['item']
    item['upc'] = response.css('div.item-overview__meta-item::text').extract()[6]
    yield item
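One thing to double-check in the snippet above: another_page is selected from response, so every product in the loop follows the same first link. If each tile has its own detail link, selecting it relative to product is probably what is intended, for example:

# assumes each item-list__tile contains its own anchor to the detail page
another_page = product.css('a::attr(href)').get()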

Scrapy Spider not following Request callback using yield

I'm new to scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods are working fine, I don't get any ERROR, and I checked the 'link' variable, which has the correct info. Help?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    name = "deputies"
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        sel = Selector(response)
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        iteration = 1
        # get deputies pages
        for sel_option in sel_options:
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = 'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id=' + item["link_id"]
            item["id"] = iteration
            iteration += 1
            # go scrape their page
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # finish filling in the data
        item["full_name"] = info.xpath("text()").extract_first()
        item["party"] = info.xpath("text()").extract()[2].partition('/')[0]
        item["uf"] = info.xpath("text()").extract()[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = info.xpath("text()").extract()[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file
        file = open('deputies_info.json', 'a')
        line = json.dumps(dict(item)) + ",\n"
        file.write(line)
        # collect votes info
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        #vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')
Your problem is allowed_domains, because the link you are trying to request in parse_deputy is, for example: http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016
Its domain is camara.gov.br, so add it to allowed_domains:
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code with allowed_domains commented out, and parse_votes works perfectly.
I ran your spider and found why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found out that it is not in the same domain.
The link belongs to the camara.gov.br domain, which is not covered by allowed_domains = ["camara.leg.br"].
So you need to add this domain to the allowed_domains list:
allowed_domains = ["camara.leg.br", "camara.gov.br"]
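When a request is dropped for this reason, Scrapy's OffsiteMiddleware normally logs it at DEBUG level, so the crawl log is a quick way to confirm the diagnosis; the line looks roughly like this:
DEBUG: Filtered offsite request to 'www.camara.gov.br': <GET http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?...>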

Scrapy merge subsite-item with site-item

I'm trying to scrape details from a subsite and merge them with the details scraped from the main site. I've been researching on Stack Overflow as well as in the documentation, but I still can't get my code to work. It seems that my function to extract additional details from the subsite does not work. If anyone could take a look I would be very grateful.
# -*- coding: utf-8 -*-
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapeInfo.items import infoItem
import pyodbc

class scrapeInfo(Spider):
    name = "info"
    allowed_domains = ["http://www.nevermind.com"]
    start_urls = []

    def start_requests(self):
        # Get infoID and Type from database
        self.conn = pyodbc.connect('DRIVER={SQL Server};SERVER=server;DATABASE=dbname;UID=user;PWD=password')
        self.cursor = self.conn.cursor()
        self.cursor.execute("SELECT InfoID, category FROM dbo.StageItem")
        rows = self.cursor.fetchall()
        for row in rows:
            url = 'http://www.nevermind.com/info/'
            InfoID = row[0]
            category = row[1]
            yield self.make_requests_from_url(url+InfoID, InfoID, category, self.parse)

    def make_requests_from_url(self, url, InfoID, category, callback):
        request = Request(url, callback)
        request.meta['InfoID'] = InfoID
        request.meta['category'] = category
        return request

    def parse(self, response):
        hxs = Selector(response)
        infodata = hxs.xpath('div[2]/div[2]')  # input item path
        itemPool = []
        InfoID = response.meta['InfoID']
        category = response.meta['category']
        for info in infodata:
            item = infoItem()
            item_cur, item_hist = InfoItemSubSite()
            # Stem Details
            item['id'] = InfoID
            item['field'] = info.xpath('tr[1]/td[2]/p/b/text()').extract()
            item['field2'] = info.xpath('tr[2]/td[2]/p/b/text()').extract()
            item['field3'] = info.xpath('tr[3]/td[2]/p/b/text()').extract()
            item_cur['field4'] = info.xpath('tr[4]/td[2]/p/b/text()').extract()
            item_cur['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()
            # Extract additional information about item_cur from the referring site
            # This part does not work
            if item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract():
                url = 'http://www.nevermind.com/info/sub/' + item_cur['field6'] = info.xpath('tr[6]/td[2]/p/b/@href').extract()[0]
                request = Request(url, housingtype, self.parse_item_sub)
                request.meta['category'] = category
                yield self.parse_item_sub(url, category)
            item_his['field5'] = info.xpath('tr[5]/td[2]/p/b/text()').extract()
            item_his['field6'] = info.xpath('tr[6]/td[2]/p/b/text()').extract()
            item_his['field7'] = info.xpath('tr[7]/td[2]/p/b/@href').extract()
            item['subsite_dic'] = [dict(item_cur), dict(item_his)]
            itemPool.append(item)
            yield item
        pass

    # Function to extract additional info from the subsite, and return it to the original item.
    def parse_item_sub(self, response, category):
        hxs = Selector(response)
        subsite = hxs.xpath('div/div[2]')  # input base path
        category = response.meta['category']
        for i in subsite:
            item = InfoItemSubSite()
            if (category == 'first'):
                item['subsite_field1'] = i.xpath('/td[2]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[2]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[5]/a[1]/@href').extract()
            else:
                item['subsite_field1'] = i.xpath('/tr[10]/td[3]/span/@title').extract()
                item['subsite_field2'] = i.xpath('/tr[4]/td[1]/text()').extract()
                item['subsite_field3'] = i.xpath('/div[7]/a[1]/@href').extract()
            return item
        pass
I've been looking at these examples together with a lot of other examples (Stack Overflow is great for that!), as well as the scrapy documentation, but I am still unable to understand how details sent from one function get merged with the scraped items from the original function:
how do i merge results from target page to current page in scrapy?
How can i use multiple requests and pass items in between them in scrapy python
What you are looking for here is called request chaining. Your problem is yielding one item built from several requests. A solution to this is to chain the requests while carrying your item in the request's meta attribute.
Example:
def parse(self, response):
    item = MyItem()
    item['name'] = response.xpath("//div[@id='name']/text()").extract()
    more_page = ...  # some page that offers more details

    # go to the more-details page and take your item with you
    yield Request(more_page,
                  self.parse_more,
                  meta={'item': item})

def parse_more(self, response):
    # get your item from the meta
    item = response.meta['item']
    # fill it in with more data and yield!
    item['last_name'] = response.xpath("//div[@id='lastname']/text()").extract()
    yield item
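Applied to the code in the question, this means yielding the chained Request (with the partly-filled item in meta) instead of calling parse_item_sub directly. A rough fragment for the block inside parse that currently "does not work" (item, item_cur, info and category are the names from the question; hrefs is just a local name for the extracted list):

# fragment only: replaces the direct call to self.parse_item_sub(...)
hrefs = info.xpath('tr[6]/td[2]/p/b/@href').extract()
if hrefs:
    item_cur['field6'] = hrefs[0]
    url = 'http://www.nevermind.com/info/sub/' + hrefs[0]
    # parse_item_sub would then take only (self, response) and read meta itself
    yield Request(url, callback=self.parse_item_sub,
                  meta={'item': item, 'category': category})
else:
    yield item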

Scrapy returns same piece of information 80+ times

New to scrapy and python and running into an issue here.
I'm trying to get the entire list of PS3 games from Metacritic. Here is my code:
class MetacriticSpider(BaseSpider):
    name = "metacritic"
    allowed_domains = ["metacritic.com"]
    max_id = 10
    start_urls = [
        "http://www.metacritic.com/browse/games/title/ps3?page="
        #"http://www.metacritic.com/browse/games/title/xbox360?page=0"
    ]

    def start_requests(self):
        for c in lowercase:
            for i in range(self.max_id):
                yield Request('http://www.metacritic.com/browse/games/title/ps3/{0}?page={1}'.format(c, i), callback=self.parse)

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="product_wrap"]/div')
        items = []
        for site in sites:
            #item = MetacriticItem()
            #titles = site.xpath('a/text()').extract()
            titles = site.xpath('//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
            #cscore = site.xpath('//div[contains(@class, "basic_stat product_score brief_metascore")]/div[1]/text()').extract()
            if titles:
                item = MetacriticItem()
                item['title'] = titles[0].strip()
                items.append(item)
        return items
For some reason, when I check the JSON file, I have 81 instances of each title, and it starts with
Assassin's Creed: Revelations - Ancestors Character Pack
It should start on the first page (the numbered titles), then progress to the A list, checking each page in that, and so on.
Any ideas on why it is doing it this way? I can't see what my problem is.
Your xpath should be relative (.//) to each site:
titles = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
Also, change the sites selection xpath to (note, no div at the end):
//div[@class="product_wrap"]
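Putting both fixes together, the loop might look roughly like this (a sketch reusing the question's Selector and MetacriticItem; yielding each item is equivalent to collecting them in a list and returning it):

def parse(self, response):
    sel = Selector(response)
    for site in sel.xpath('//div[@class="product_wrap"]'):
        # .// keeps the search inside this product_wrap only
        titles = site.xpath('.//div[contains(@class, "basic_stat product_title")]/a/text()').extract()
        if titles:
            item = MetacriticItem()
            item['title'] = titles[0].strip()
            yield item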
