I'm using Scrapy to parse a page. The page has subpages (categories) from which I also need to get information and combine everything into one item (perhaps saving the information from the additional pages as JSON), which I then add to a CSV. I've tried different options, such as:
requests = scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
Or
yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
But neither method works the way I want it to.
For example, I take pages from https://www.webscorer.com/findraces?pg=results (e.g. https://www.webscorer.com/seriesresult?seriesid=211565) and extract information from each one. After that, I need to get additional information from each category page (e.g. https://www.webscorer.com/seriesresult?seriesid=211565&gender=F) and put all of it into the CSV. My code now:
import re

import scrapy
from scrapy.http import Response

# WebscorerEvent and parse_webscorer_date are defined elsewhere in the project

class WebscorerSpider(scrapy.Spider):
    name = 'webscorer'
    allowed_domains = ['webscorer.com']

    def start_requests(self):
        url = 'https://www.webscorer.com/findraces?pg=results'
        yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response, **kwargs):
        for href in response.css('table.results-table tbody tr a::attr("href")').extract():
            url = response.urljoin(href)
            # note: this overrides the url built above with a fixed series page
            url = 'https://www.webscorer.com/seriesresult?seriesid=211565'
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response: Response, **kwargs):
        latlong_match = re.search('lat=(.*)&lng=(.*)', response.css('span#FSrc::text').get())

        item = dict()
        for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
            url = response.urljoin(href)
            # requests = scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)
            yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)

        yield WebscorerEvent(name=response.css('h1.race-name::text').get(),
                             source_url=response.request.url,
                             sport_discipline=response.css('td.spec+td').css('strong::text').get(),
                             description=response.css('span.regnotes span::text').get(),
                             hero_image=response.css('p.associated-race-pic img::attr(src)').get(),
                             start_date=parse_webscorer_date(response.css('p.race-date::text').get()),
                             location={
                                 "link": f"https://www.google.com/maps/search/?api=1&query={latlong_match.group(1)},{latlong_match.group(2)}",
                                 "description": response.css('td.spec:contains("Location:")+td strong::text').get()})

    def parse_category(self, response, **kwargs):
        item = response.meta['meta_item']
        # print(item)
        item['winner'] = response.css('table.results-table .r-racername span::text').get()
        return item
You yield WebscorerEvent in parse(), so the item has already been emitted ("dropped" into the pipeline) before the category page is fetched and its data added.
You could do something like this:
def parse(self, response: Response, **kwargs):
    latlong_match = re.search('lat=(.*)&lng=(.*)', response.css('span#FSrc::text').get())

    item = {
        "name": response.css('h1.race-name::text').get(),
        "source_url": response.request.url,
        "sport_discipline": response.css('td.spec+td').css('strong::text').get(),
        "description": response.css('span.regnotes span::text').get(),
        "hero_image": response.css('p.associated-race-pic img::attr(src)').get(),
        "start_date": parse_webscorer_date(response.css('p.race-date::text').get()),
        "location": {
            "link": f"https://www.google.com/maps/search/?api=1&query={latlong_match.group(1)},{latlong_match.group(2)}",
            "description": response.css('td.spec:contains("Location:")+td strong::text').get()
        }
    }

    for href in response.css('table.category-table .category-name').css('a::attr("href")').extract():
        url = response.urljoin(href)
        yield scrapy.Request(url, meta={'meta_item': item}, callback=self.parse_category)

def parse_category(self, response, **kwargs):
    item = response.meta['meta_item']
    item['winner'] = response.css('table.results-table .r-racername span::text').get()
    yield WebscorerEvent(item)
That way you only yield the item at the end, once all the data you need has been collected.
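As a side note, since Scrapy 1.7 you can pass data to a callback with cb_kwargs instead of meta. Here is a minimal sketch of the same chaining using cb_kwargs, passing a copy of the dict so each category request works on its own independent item (same selectors as above, abridged to the relevant parts):

import scrapy

class WebscorerSpider(scrapy.Spider):
    name = 'webscorer'

    def parse(self, response, **kwargs):
        item = {'name': response.css('h1.race-name::text').get()}  # plus the other fields shown above

        for href in response.css('table.category-table .category-name a::attr(href)').extract():
            # dict(item) gives every category request its own copy of the event data
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_category,
                                 cb_kwargs={'item': dict(item)})

    def parse_category(self, response, item):
        item['winner'] = response.css('table.results-table .r-racername span::text').get()
        yield item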
I have URLs with multiple pages. I try to paginate to extract data from these URLs, but it only works once (just one next_page). What's wrong?
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from rzc_spider.items import AnnonceItem

class AnnonceSpider(scrapy.Spider):
    name = 'rzc_results'

    def __init__(self, *args, **kwargs):
        data_file = pkgutil.get_data("rzc_spider", "json/input/test_tt.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        for item in self.data:
            request = scrapy.Request(item['rzc_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        item = response.meta['item']
        item['results'] = []
        item["car_number"] = response.css("h2.sub::text").extract_first()
        for caritem in response.css("div.ad > div[itemtype='https://schema.org/Vehicle']"):
            data = AnnonceItem()
            # model
            data["model"] = caritem.css("em.title::text").extract_first()
            item['results'].append(data)
        yield item

        next_page = response.css('a.link::attr(href)').extract_first()
        if next_page is not None:
            url_pagination = 'https://www.websiteexample.com' + next_page
            meta = {'item': response.meta['item']}
            yield scrapy.Request(url=url_pagination, callback=self.parse, meta=meta)

    # ban proxies reaction
    def response_is_ban(self, request, response):
        return b'banned' in response.body

    def exception_is_ban(self, request, exception):
        return None
The json file with the url (a sample in this case):
[{
"rzc_url": "https://www.websiteexample.com/model"
}]
Try checking the URLs. Sometimes sites set traps so that only next_page is an absolute URL while another one is relative. Instead of concatenating a hard-coded base with next_page, use urljoin. Import it (from urllib.parse) and then:
yield scrapy.Request(urljoin(response.url, next_page), callback=self.parse, meta=meta)
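A minimal sketch of how that fits into the question's parse() method, assuming the same a.link selector (Scrapy's own response.urljoin would work just as well here):

from urllib.parse import urljoin

# inside parse(), after yielding the item:
next_page = response.css('a.link::attr(href)').extract_first()
if next_page is not None:
    # urljoin resolves both relative and absolute hrefs against the current page URL
    yield scrapy.Request(urljoin(response.url, next_page),
                         callback=self.parse,
                         meta={'item': response.meta['item']})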
I'm new to Scrapy and Python. I'm able to download all the files, but I want to download only files of the specific type "EX-10", so that it downloads the following files (EX-10.1, EX-10.2, up to EX-10.99).
My code:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                baseLink = response.urljoin(links)
                yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf): os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
Scrapy should also follow the next pages (up to the last page), but that part is not working. I tried:
Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@value="Next 40"]',)), callback="parse", follow=True),)

# follow next page links
next_page = response.xpath('.//a[@value="Next 40"]/@href').extract()
if next_page:
    next_href = next_page[0]
    next_page_url = 'https://www.sec.gov/cgi-bin/browse-edgar?company=&match=&CIK=&filenum=&State=&Country=&SIC=2834&owner=exclude&Find=Find+Companies&action=getcompany' + next_href
    request = scrapy.Request(url=next_page_url)
    yield request
This should solve your problem. The following script fetches the required files from that site, following every pagination link and downloading the files the way you wanted:
import scrapy, os

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

        nextpage = response.css("input[value='Next 40']::attr(onclick)")
        if nextpage:
            tpage = nextpage.extract_first().split("parent.location=")[1].replace("'", "")
            nlink = response.urljoin(tpage)
            yield scrapy.Request(url=nlink, callback = self.parse)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//tr[td[starts-with(., "EX-")]]/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href').extract():
            baseLink = response.urljoin(links)
            yield scrapy.Request(url = baseLink, callback = self.download_files)

    def download_files(self, response):
        path = response.url.split('/')[-1]
        dirf = r"/home/surukam/scrapy/demo/tutorial/tutorial/Downloads3"
        if not os.path.exists(dirf): os.makedirs(dirf)
        os.chdir(dirf)
        with open(path, 'wb') as f:
            f.write(response.body)
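Since the question only wants the EX-10 exhibits (EX-10.1 up to EX-10.99), the starts-with test in collecting_file_links can presumably be narrowed from "EX-" to "EX-10". A sketch of that variant, assuming the same document table layout as in the XPath above:

    def collecting_file_links(self, response):
        # keep only rows whose type cell starts with "EX-10" (EX-10.1 ... EX-10.99)
        xpath = ('//table[contains(@summary,"Document")]'
                 '//tr[td[starts-with(., "EX-10")]]'
                 '/td/a[contains(@href, ".htm") or contains(@href, ".txt")]/@href')
        for links in response.xpath(xpath).extract():
            yield scrapy.Request(url=response.urljoin(links), callback=self.download_files)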
You need to use a FilesPipeline, but the one that scrapy provides generates the file name based on the hash of the URL.
If you want a custom file name, you have to make your own FilesPipeline like this:
import scrapy, os
from scrapy.pipelines.files import FilesPipeline

class legco(scrapy.Spider):
    name = "sec_gov"

    start_urls = ["https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&SIC=2834&owner=exclude&match=&start=120&count=40&hidefilings=0"]

    custom_settings = {
        'ITEM_PIPELINES': {'myspider.MyFilesPipeline': 1},
        'FILES_STORE': '/my/valid/path/',
    }

    def parse(self, response):
        for link in response.xpath('//table[@summary="Results"]//td[@scope="row"]/a/@href').extract():
            absoluteLink = response.urljoin(link)
            yield scrapy.Request(url = absoluteLink, callback = self.parse_links)

    def parse_links(self, response):
        for links in response.xpath('//table[@summary="Results"]//a[@id="documentsbutton"]/@href').extract():
            targetLink = response.urljoin(links)
            yield scrapy.Request(url = targetLink, callback = self.collecting_file_links)

    def collecting_file_links(self, response):
        for links in response.xpath('//table[contains(@summary,"Document")]//td[@scope="row"]/a/@href').extract():
            if links.endswith(".htm") or links.endswith(".txt"):
                yield {
                    'file_urls': [response.urljoin(links)]
                }

class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        return request.url.split('/')[-1]
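One hedge on the file_path override: in newer Scrapy releases (2.4+) the hook is also passed the item as a keyword-only argument, so an equivalent override there would look like this; the filename logic itself is unchanged:

from scrapy.pipelines.files import FilesPipeline

class MyFilesPipeline(FilesPipeline):
    # newer signature; the three-argument form above is the older one
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.url.split('/')[-1]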
I want to scrape every next page. I've found a way to do it with the Scrapy shell, but I don't know whether my spider will iterate through every page or just the next one, and I'm not sure how to implement that.
import string

import scrapy
from scrapy import Request

alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
# sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url]  # start/stop/steps
# full_url = each_url + sub_page_of_url

class AnimeScraper_Spider(scrapy.Spider):
    name = "Anime"

    def start_requests(self):
        for url in each_url:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        next_page_url = response.xpath(
            "//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)'):
            url = response.urljoin(href.extract())
            yield Request(url, callback=self.parse_anime)
        yield Request(next_page_url, callback=self.parse)

    def parse_anime(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            return {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip()
            }
I think you're trying something too complicated; it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like this:
import string

import scrapy
from scrapy import Request

class AnimeSpider(scrapy.Spider):
    name = "Anime"
    start_urls = ['https://myanimelist.net/anime.php']

    def parse(self, response):
        xp = "//div[@id='horiznav_nav']//li/a/@href"
        return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())

    def parse_anime_list_page(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            yield {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
            }

        next_urls = response.xpath("//div[@class='spaceit']//a/@href").extract()
        for next_url in next_urls:
            yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)
I'm trying to create a web crawler (in Python, using Scrapy) that extracts information from an ad: it extracts what is on the main page, then enters the ad's subpage and extracts the remaining information, but it gives an error when I run the code. Any suggestions?
import scrapy

class SapoSpider(scrapy.Spider):
    name = "imo"
    start_urls = ['https://www.imovirtual.com/comprar/apartamento/lisboa/']

    def parse(self, response):
        for Property in response.css('div.offer-item-details'):
            youritem = {
                'preco': Property.css('span.offer-item title::text').extract_first(),
                'autor': Property.css('li.offer-item-price::text').extract(),
                'data': Property.css('li.offer-item-area::text').extract(),
                'data_2': Property.css('li.offer-item-price-perm::text').extract()
            }
            yield scrapy.Request(subpage_link, callback=self.parse_subpage)

        # next_page = response.css('li.pager-next a::attr(href)').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)

    def parse_subpage(self, youritem):
        for i in response.css('header[class=offer-item-header] a::attr(href)'):
            youritem = {
                'info': i.css('ul.main-list::text').extract(),
            }
            yield youritem
There are a few things to change to make it run:
You have to set subpage_link (it does not seem to be defined anywhere).
Request callbacks take only one parameter, the Scrapy response, so you should replace parse_subpage(self, youritem) with parse_subpage(self, response).
To send your item along with the Request, it's best to use the Request's meta parameter, which lets you transfer data from one Scrapy response to another. If you replace scrapy.Request(subpage_link, callback=self.parse_subpage) with scrapy.Request(subpage_link, callback=self.parse_subpage, meta={'item': youritem}), you will have access to youritem when Scrapy calls parse_subpage, via response.meta.get('item').
This should work:
def parse(self, response):
    for Property in response.css('div.offer-item-details'):
        youritem = {
            'preco': Property.css('span.offer-item title::text').extract_first(),
            'autor': Property.css('li.offer-item-price::text').extract(),
            'data': Property.css('li.offer-item-area::text').extract(),
            'data_2': Property.css('li.offer-item-price-perm::text').extract()
        }
        subpage_link = ......
        yield scrapy.Request(subpage_link, callback=self.parse_subpage,
                             meta={'item': youritem})

    # next_page = response.css('li.pager-next a::attr(href)').extract_first()
    # if next_page is not None:
    #     next_page = response.urljoin(next_page)
    #     yield scrapy.Request(next_page, callback=self.parse)

def parse_subpage(self, response):
    for i in response.css('header[class=offer-item-header] a::attr(href)'):
        youritem = response.meta.get('item')
        youritem['info'] = i.css('ul.main-list::text').extract()
        yield youritem
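As for the subpage_link placeholder, one hypothetical way to fill it in, guessing from the header selector the question itself uses in parse_subpage (the exact selector depends on the page markup):

        # hypothetical: derive the detail-page URL from the offer header link
        subpage_link = Property.css('header.offer-item-header a::attr(href)').extract_first()
        if subpage_link is not None:
            yield scrapy.Request(response.urljoin(subpage_link),
                                 callback=self.parse_subpage,
                                 meta={'item': youritem})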
So let's say I want to write a spider that uses the Facebook API to calculate the likes on every page of a website. If I import the requests library, I'm able to call the Facebook Graph API as follows.
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        item['fb_url'], item['shares'], item['comments'] = self.get_likes(response.url)
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
However, I can't seem to get this code to work if, rather than using requests, I use a scrapy.Request call. Something like this:
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(base, callback=self.parse_likes)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        links = response.css('a::attr(href)').extract()
        item['url'] = response.url
        item['fb_data'] = self.get_likes(response.url).body
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
In this case, I just get a blank response for the Facebook data. I think I'm missing something about how the scrapy.Request method works relative to the standard requests library. Any ideas?
This is a very common case: how do you yield an item built from multiple URLs?
The key difference from the requests library is that constructing a scrapy.Request does not fetch anything; the request is only sent once it is yielded back to the engine, so reading .body on it just gives you the (empty) request payload, never a response. That is why fb_data comes out blank.
The most common solution is to chain requests, carrying your item in the request's meta parameter.
For your example, an implementation with this logic could look like:
import json

import scrapy

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        links = response.css('a::attr(href)').extract()
        for link in links:
            item = {}
            item['url'] = response.url
            item['link'] = response.urljoin(link)
            # the page URL fills the first placeholder, the token the second
            api_url = self.base(item['link'], self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        item = response.meta['item']
        data = json.loads(response.text)
        share_count = data['id'], data['share']['comment_count'], data['share']['share_count']
        item['share_count'] = share_count
        yield item