How to scrape data up to n levels using Scrapy - Python

I am new to Scrapy and this is my first attempt at web scraping. The structure of the website I am trying to scrape is the following:
level 0: main company URL ---> level 1: several associated company URLs ---> level 2: each associated company URL in level 1 has many linked URLs ---> ... up to level n
Right now I can scrape data up to level 1, but I want to do it up to the nth level recursively, with a control like max_depth that limits how deep I scrape. I cannot figure out how to do it.
Here is the spider I have written so far:
import scrapy
from ..items import *

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    base_url = 'https://www.zaubacorp.com/'
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def parse(self, response):
        search_links = response.xpath('//table[@id="results"]/tr/td/a[contains(@href,"company/DOIT-URBAN")]/@href').getall()
        page_list = search_links[1:]
        #url = search_links.pop(0)
        check_list = []
        for url in search_links:
            print("func 1")
            yield response.follow(url=url, callback=self.parse_doit,
                                  meta={'page_list': page_list,
                                        'check_list': check_list})

    def parse_doit(self, response):
        print("func 2")
        check_list = response.meta['check_list']
        lnk = MainLink()
        lnk['name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        lnk['url'] = response.url
        lnk['address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        lnk['email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        lnk['director1'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        lnk['director2'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        dir1_same_co_list = response.xpath('//*[@id="accordion1"]/table[1]//td//p/a/@href').getall()
        dir2_same_co_list = response.xpath('//*[@id="accordion2"]/table[1]//td//p/a/@href').getall()
        co_list = dir1_same_co_list + list(set(dir2_same_co_list) - set(dir1_same_co_list))
        dir_same_co_list = list(set(co_list) - set(check_list))
        check_list = check_list + list(set(dir_same_co_list) - set(check_list))
        page_list = response.meta['page_list']
        if dir1_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})

    def parse_level_2(self, response):
        print("func 3")
        lnk = response.meta['name']
        lnk = response.meta['url']
        lnk = response.meta['address']
        lnk = response.meta['email']
        lnk = response.meta['director1']
        lnk = response.meta['director2']
        page_list = response.meta['page_list']
        #next_page = response.meta['next_page']
        level_2 = SecondaryLink()
        try:
            lnk['Company_Details_W_Same_Directors']
        except:
            lnk['Company_Details_W_Same_Directors'] = []
        #for sub_link in dir1_same_co_list:
        level_2['Co_Name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        level_2['Co_url'] = response.url
        level_2['Address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        level_2['Email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        level_2['First_Director'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        level_2['Second_Director'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        lnk['Company_Details_W_Same_Directors'].append(level_2)
        dir_same_co_list = response.meta['dir_same_co_list']
        print("===== start reading co list =====")
        if dir_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            print("co list", len(dir_same_co_list))
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})
        else:
            if page_list:
                print("next page loop")
                next_page = page_list.pop(0)
                next_page_url = next_page
                yield response.follow(url=next_page_url, callback=self.parse_doit,
                                      meta={'name': lnk,
                                            'url': lnk,
                                            'address': lnk,
                                            'email': lnk,
                                            'director1': lnk,
                                            'director2': lnk,
                                            'next_page': next_page,
                                            'page_list': page_list})
            else:
                yield lnk
And the items.py is as follows:
class MainLink(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    address = scrapy.Field()
    email = scrapy.Field()
    director1 = scrapy.Field()
    Company_Details_W_Same_Directors = scrapy.Field()
    director2 = scrapy.Field()

class SecondaryLink(scrapy.Item):
    Co_Name = scrapy.Field()
    Co_url = scrapy.Field()
    Address = scrapy.Field()
    Email = scrapy.Field()
    First_Director = scrapy.Field()
    Second_Director = scrapy.Field()
Help is much appreciated

You can make use of the DEPTH_LIMIT setting in Scrapy. See https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit
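For example, here is a minimal sketch of how that limit could be set per spider (the limit of 3 and the generic link selector are just illustrations; DEPTH_LIMIT and the depth key in response.meta are standard Scrapy features):

import scrapy

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']

    custom_settings = {
        # Requests nested deeper than this are dropped by Scrapy's DepthMiddleware.
        'DEPTH_LIMIT': 3,
    }

    def parse(self, response):
        # Scrapy records the current depth of every scheduled request.
        depth = response.meta.get('depth', 0)
        self.logger.info('Parsing %s at depth %d', response.url, depth)
        for href in response.xpath('//a/@href').getall():
            # Followed requests get depth + 1; beyond DEPTH_LIMIT they are discarded.
            yield response.follow(href, callback=self.parse)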

Related

When I use Scrapy to export my scraped data to CSV, the data of all rows ends up in just one row, no matter how many rows I have

import scrapy
from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None
        imagelink = response.css('.s-image::attr(src)').extract()
        items['name'] = name
        items['old_price'] = old_price
        items['price'] = price
        items['review'] = review
        items['imagelink'] = imagelink
        # description =
        # ram =
        # brand =
        # cpu_model =
        yield items
When I use Scrapy to export my scraped data to a CSV file (or any other file), the data of all rows is inserted into just one row, no matter how many rows I have. Suppose I have 200 rows in 1 column; I still get all 200 rows of data in a single row.
That's because you're yielding one item that contains all the results instead of yielding each item separately.
A not-so-nice solution:
import scrapy
# from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        # items = AmazondawinItem()
        name = response.css('.a-size-medium::text').extract()
        try:
            old_price = response.css('.a-spacing-top-micro .a-text-price span::text').extract()
        except:
            old_price = None
        price = response.css('.a-spacing-top-micro .a-price-whole::text').extract()
        try:
            review = response.css('.s-link-style .s-underline-text::text').extract()
        except:
            review = None
        imagelink = response.css('.s-image::attr(src)').extract()
        # items = dict()
        # items['name'] = name
        # items['old_price'] = old_price
        # items['price'] = price
        # items['review'] = review
        # items['imagelink'] = imagelink
        items = dict()
        for (items['name'], items['old_price'], items['price'], items['review'], items['imagelink']) in zip(name, old_price, price, review, imagelink):
            yield items
        # description =
        # ram =
        # brand =
        # cpu_model =
        # yield items
A better solution:
Remove the try/except: get() returns None if no value was found, and bare except blocks are better avoided in spiders anyway.
Get the items one by one, one result row at a time.
Just replace the dict with your item class and make sure it is created inside the loop (see the sketch after the code below).
import scrapy
# from ..items import AmazondawinItem

class AmazonspiderSpider(scrapy.Spider):
    name = 'amazon'
    pagenumber = 3
    allowed_domains = ['amazon.com']
    start_urls = [
        'https://www.amazon.com/s?k=laptop&i=computers&crid=27GFGJVF4KNRP&sprefix=%2Ccomputers%2C725&ref=nb_sb_ss_recent_1_0_recent'
    ]

    def parse(self, response):
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            # items = AmazondawinItem()
            items = dict()
            items['name'] = row.css('.a-size-medium::text').get()
            items['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            items['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()  # scoped to the row, not the whole response
            items['review'] = row.css('.s-link-style .s-underline-text::text').get()
            items['imagelink'] = row.css('.s-image::attr(src)').get()
            yield items
            # description =
            # ram =
            # brand =
            # cpu_model =
            # yield items
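If you want the CSV columns to come from the fields declared in items.py, here is a minimal sketch of the same loop using the item class instead of a dict (this assumes AmazondawinItem declares name, old_price, price, review and imagelink fields, as the original spider suggests):

    def parse(self, response):
        # Assumes `from ..items import AmazondawinItem` at the top of the spider,
        # as in the original code.
        for row in response.css('div.s-result-list div.s-result-item.s-asin'):
            item = AmazondawinItem()   # one item instance per result row
            item['name'] = row.css('.a-size-medium::text').get()
            item['old_price'] = row.css('.a-spacing-top-micro .a-text-price span::text').get()
            item['price'] = row.css('.a-spacing-top-micro .a-price-whole::text').get()
            item['review'] = row.css('.s-link-style .s-underline-text::text').get()
            item['imagelink'] = row.css('.s-image::attr(src)').get()
            yield item                 # yielded inside the loop: one CSV row per item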

Why is scrapy returning only partial results?

I am relatively new to Python and Scrapy.
I am trying to scrape the job portal https://www.jobs.ch/de/. Currently I start with https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/.
At the moment the scraper works fine but does not return all job results. Out of 24 results per page, Scrapy returns a varying number of results (tested with 2 pages: 21/24 and 23/24). I checked whether the CSS path is different for the missing results, but they are identical. Does anybody have an idea why I don't get all results? I would really appreciate any suggestions.
import scrapy
from jobscraping.items import JobscrapingItem

class GetdataSpider(scrapy.Spider):
    name = 'getdata5'
    start_urls = ['https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/']

    def parse(self, response):
        yield from self.scrape(response)
        next_page = response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.itnOWY > a.sc-fznxsB.fvMaWZ.Link-sc-1vy3ms6-1.fvbIfL:last-child').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def scrape(self, response):
        for add in response.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd'):
            item = JobscrapingItem()
            addpage = response.urljoin(add.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd a::attr(href)').get(default='not-found'))
            item['addlink'] = addpage
            item['Position'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.VacancySerpItem___StyledText-qr45cp-6.gHnsfC::text').get(default='not-found')
            item['Company'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY > strong::text').get(default='not-found')
            item['Location'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY::text').get(default='not-found')
            request1 = scrapy.Request(addpage, callback=self.get_addinfos)
            request1.meta['item'] = item
            yield request1

    def get_addinfos(self, response):
        for details in response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.VacancyDetailHead__StyledVacancyDetailHead-sc-14lkltl-0.VacancyDetailHead___StyledStyledVacancyDetailHead-sc-14lkltl-1.deEQGn'):
            item = response.meta['item']
            companypage = response.urljoin(details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get(default='not-found'))
            item['companylink'] = companypage
            item['Date'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(1) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Rank'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(2) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Workload'] = details.css('span.sc-fzqNJr.Badge-sc-88vuku-0.dCIQfi::text').get(default='not-found')
            request2 = scrapy.Request(companypage, callback=self.get_companyinfos)
            request2.meta['item'] = item
            yield request2

    def get_companyinfos(self, response):
        item = response.meta['item']
        item['Industry'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.leHDqM::text').get(default='not-found')
        item['Open_jobs'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.fMPCQO > span::text').get(default='not-found')
        item['Employees'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.GqJfV > span::text').get(default='not-found')
        item['Rating_overall'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.gKcdxd::text').get(default='not-found')
        item['Rating_detailed'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.hVUXAg::text').getall()
        item['Rating_numbers'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.kxNaWG::text').get(default='not-found')
        yield item
The items.py file:
import scrapy

class JobscrapingItem(scrapy.Item):
    # define the fields for your item here like:
    addlink = scrapy.Field()
    Position = scrapy.Field()
    Company = scrapy.Field()
    Location = scrapy.Field()
    Date = scrapy.Field()
    Rank = scrapy.Field()
    Workload = scrapy.Field()
    companylink = scrapy.Field()
    Industry = scrapy.Field()
    Open_jobs = scrapy.Field()
    Employees = scrapy.Field()
    Rating_overall = scrapy.Field()
    Rating_detailed = scrapy.Field()
    Rating_numbers = scrapy.Field()
I found the mistake in my code. Since some of the companies posted more than one job, the requests to their company pages hit identical URLs and Scrapy's duplicate filter dropped them. I set dont_filter=True and this resolved the issue. I also added an if statement for the postings that do not have a company link, so the scraper yields those items directly instead of trying to crawl a company page.
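A minimal sketch of what those two changes could look like inside get_addinfos (the exact shape is an assumption based on the description above; 'not-found' is the default already used by the selectors in the original spider, and the container selector is shortened here for readability):

    def get_addinfos(self, response):
        for details in response.css('div.VacancyDetailHead'):   # shortened selector, stand-in for the original
            item = response.meta['item']
            companypage = response.urljoin(
                details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get(default='not-found'))
            item['companylink'] = companypage
            # ... Date / Rank / Workload fields as in the original ...
            if companypage.endswith('not-found'):
                # No company page linked: yield the item directly instead of crawling further.
                yield item
            else:
                # dont_filter=True keeps the dupefilter from dropping repeated company URLs.
                yield scrapy.Request(companypage, callback=self.get_companyinfos,
                                     meta={'item': item}, dont_filter=True)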

Xpath selection only returns first response result

I'm still new to Scrapy. When trying to read data from quotes.toscrape.com, I don't get the content I expect back when using XPath selectors. As soon as I use CSS selectors, everything works as intended. I just can't find the error even though the example is super simple.
quotes.py
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item
        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
items.py
import scrapy
from scrapy.loader import ItemLoader

class QuotesLoaderItem(scrapy.Item):
    # define the fields for your item here like:
    author_name = scrapy.Field()
    quote_text = scrapy.Field()
    author_link = scrapy.Field()
    tags = scrapy.Field()
Result
author_name,quote_text,author_link,tags
Albert Einstein,“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,/author/Albert-Einstein,change
Albert Einstein, ...
...
(20 times)
Thanks in advance for your help.
I was using a Selector object inside the loop instead of the Response object, so the XPath has to be relative: an expression that starts with // searches the whole document and therefore always returns the first match on the page, while prefixing it with . restricts it to the current selector. The syntax therefore has to look like this:
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector (relative to the current quote)
            item['author_name'] = quote.xpath('.//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('.//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('.//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('.//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item
        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('.//*[class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)

Scrapy trouble : UnboundLocalError: local variable referenced before assignment

I'm trying to improve my web scraping skills but I'm stuck with my script. I want to scrape some information from Amazon.
Here's my script so far:
import scrapy
from ..items import AmazontutorialItem

class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon'
    page_number = 2
    start_urls = ['https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0']

    def parse(self, response):
        items = AmazontutorialItem()
        product_name = response.css('.a-color-base.a-text-normal::text').extract()
        product_author = response.css('.sg-col-12-of-28 span.a-size-base+ .a-size-base::text').extract()
        product_price = response.css('.a-spacing-top-small .a-price-whole::text').extract()
        product_imagelink = response.css('.s-image::attr(src)').extract()
        items['product_name'] = product_name
        items['product_author'] = product_author
        items['product_price'] = product_price
        items['product_imagelink'] = product_imagelink
        yield items
        next_page = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A283155%2Cn%3A1000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&page=' + str(AmazonSpiderSpider.page_number) + '&fst=as%3Aoff&qid=1606229780&rnid=1250225011&ref=sr_pg_2'
        if AmazonSpiderSpider.page_number <= 3:
            AmazonSpiderSpider += 1
            yield response.follow(next_page, callback=self.parse)
But I get this error:
UnboundLocalError: local variable 'AmazonSpiderSpider' referenced before assignment
I don't understand; I have never had this error before, even when web scraping.
Any ideas? Thanks.
The problem is the line AmazonSpiderSpider += 1: it assigns to the bare name AmazonSpiderSpider inside parse(), which makes Python treat that name as a local variable for the whole method. The earlier read of AmazonSpiderSpider.page_number then happens before any assignment to that local and raises UnboundLocalError. What you intended was to increment the page counter itself, which you can do through self.page_number.
The following should fix your issue:
import scrapy
from ..items import AmazontutorialItem

class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon'
    page_number = 2
    start_urls = ['https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0']

    def parse(self, response):
        items = AmazontutorialItem()
        product_name = response.css('.a-color-base.a-text-normal::text').extract()
        product_author = response.css('.sg-col-12-of-28 span.a-size-base+ .a-size-base::text').extract()
        product_price = response.css('.a-spacing-top-small .a-price-whole::text').extract()
        product_imagelink = response.css('.s-image::attr(src)').extract()
        items['product_name'] = product_name
        items['product_author'] = product_author
        items['product_price'] = product_price
        items['product_imagelink'] = product_imagelink
        yield items
        next_page = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A283155%2Cn%3A1000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&page=' + str(self.page_number) + '&fst=as%3Aoff&qid=1606229780&rnid=1250225011&ref=sr_pg_2'
        if self.page_number <= 3:
            self.page_number += 1
            yield response.follow(next_page, callback=self.parse)
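As a side note, incrementing through the class name also works, as long as you assign to the attribute rather than rebinding the bare name. A small illustration of the difference (plain Python, not Scrapy-specific; the Counter class is just an example):

class Counter:
    page_number = 2

    def bump_via_class(self):
        # Attribute assignment: the name 'Counter' is never rebound locally,
        # so there is no UnboundLocalError.
        Counter.page_number += 1      # updates the shared class attribute

    def bump_via_self(self):
        # Reads the class attribute once, then stores the incremented value
        # on the instance, shadowing the class attribute from then on.
        self.page_number += 1

c = Counter()
c.bump_via_class()
print(Counter.page_number)   # 3
c.bump_via_self()
print(c.page_number)         # 4 (instance attribute); Counter.page_number is still 3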

Create a static column item

I have a simple spider that crawls local obituaries. The code works perfectly until I try to add two static columns. All I want to do is add the date I pulled the information (pull) and the state in which it was pulled (state). It's a self-loading page, so when I add the pull date I only get the first 10 results (only the first page). If I add just the state, I only get two results. When I remove both, I get all 40+ results.
I commented out (#) the lines that aren't working properly.
items.py file:
import scrapy

class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    #pull = scrapy.Field()
    #state = scrapy.Field()
spider file:
import scrapy
import time
from al.items import AlItem

class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        #pull = time.strftime("%m/%d/%Y")
        #state = "AL"
        for item in zip(name, link, obit, news):  # removed 'pull, state'
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            #new_item['pull'] = pull
            #new_item["state"] = state
            yield new_item
Here is why: if you add pull and state to zip(name, link, obit, news), you get only 2 iterations, because state = "AL" is a string of two characters. zip() stops at the shortest of its arguments, so it takes the two characters of state and caps the whole loop at 2. With the date string "01/01/2001" (10 characters) you would get 10 iterations instead.
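A quick illustration of that zip behaviour with made-up values:

names = ['Smith', 'Jones', 'Brown', 'Davis']
links = ['/a', '/b', '/c', '/d']
state = 'AL'   # a plain string, i.e. an iterable of 2 characters

# zip stops at the shortest iterable, so only 2 tuples come out:
print(list(zip(names, links, state)))
# [('Smith', '/a', 'A'), ('Jones', '/b', 'L')]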
This will work:
class AlItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    obit = scrapy.Field()
    news = scrapy.Field()
    pull = scrapy.Field()
    state = scrapy.Field()
class AlabamaSpider(scrapy.Spider):
    name = 'alabama'
    allowed_domains = ['legacy.com']
    start_urls = ['http://www.legacy.com/obituaries/annistonstar/browse?type=paid&page=20']

    def parse(self, response):
        name = response.xpath('//a[@class="NonMobile"]/p[@class="obitName"]/text()').extract()
        link = response.xpath('//div[@class="RightColumn"]//a[@class="ObituaryButton"]/@href').extract()
        obit = response.xpath('//div[@class="NameAndLocation"]/p[@class="obitText"]/text()').extract()
        news = response.xpath('//div[@class="PublishedLine publishedLine"]/span/text()').extract()
        pull = time.strftime("%m/%d/%Y")
        state = "AL"
        for item in zip(name, link, obit, news):  # pull and state stay out of the zip
            new_item = AlItem()
            new_item['name'] = item[0]
            new_item['link'] = item[1]
            new_item['obit'] = item[2]
            new_item['news'] = item[3]
            new_item['pull'] = pull
            new_item["state"] = state
            yield new_item
