I'm trying to improve my web scraping skills, but I'm stuck with my script. I want to scrape some information from Amazon.
Here's my script so far:
import scrapy
from ..items import AmazontutorialItem

class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon'
    page_number = 2
    start_urls = ['https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0']

    def parse(self, response):
        items = AmazontutorialItem()

        product_name = response.css('.a-color-base.a-text-normal::text').extract()
        product_author = response.css('.sg-col-12-of-28 span.a-size-base+ .a-size-base::text').extract()
        product_price = response.css('.a-spacing-top-small .a-price-whole::text').extract()
        product_imagelink = response.css('.s-image::attr(src)').extract()

        items['product_name'] = product_name
        items['product_author'] = product_author
        items['product_price'] = product_price
        items['product_imagelink'] = product_imagelink

        yield items

        next_page = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A283155%2Cn%3A1000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&page=' + str(AmazonSpiderSpider.page_number) + '&fst=as%3Aoff&qid=1606229780&rnid=1250225011&ref=sr_pg_2'
        if AmazonSpiderSpider.page_number <= 3:
            AmazonSpiderSpider += 1
            yield response.follow(next_page, callback=self.parse)
But I get this error:
UnboundLocalError: local variable 'AmazonSpiderSpider' referenced before assignment
I don't understand; I've never had this error before, even in other scraping scripts.
Any ideas? Thanks.
The culprit is the line AmazonSpiderSpider += 1. Because you assign to the bare name AmazonSpiderSpider inside parse, Python treats that name as a local variable for the entire function body, so the earlier reads of AmazonSpiderSpider.page_number raise UnboundLocalError. What you were intending to do was increment the counter, and the cleanest way is to go through self.page_number.
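You can reproduce this with a plain function: an assignment to a name anywhere inside a function makes Python treat that name as local for the whole function body, so any earlier read of it fails. A minimal illustration:

counter = 2

def bump():
    print(counter)  # UnboundLocalError: local variable 'counter' referenced before assignment
    counter += 1    # this assignment is what makes 'counter' local to bump()

bump()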
The following should fix your issue:
import scrapy
from ..items import AmazontutorialItem

class AmazonSpiderSpider(scrapy.Spider):
    name = 'amazon'
    page_number = 2
    start_urls = ['https://www.amazon.com/s?bbn=1&rh=n%3A283155%2Cn%3A%211000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&fst=as%3Aoff&qid=1606224210&rnid=1250225011&ref=lp_1_nr_p_n_publication_date_0']

    def parse(self, response):
        items = AmazontutorialItem()

        product_name = response.css('.a-color-base.a-text-normal::text').extract()
        product_author = response.css('.sg-col-12-of-28 span.a-size-base+ .a-size-base::text').extract()
        product_price = response.css('.a-spacing-top-small .a-price-whole::text').extract()
        product_imagelink = response.css('.s-image::attr(src)').extract()

        items['product_name'] = product_name
        items['product_author'] = product_author
        items['product_price'] = product_price
        items['product_imagelink'] = product_imagelink

        yield items

        next_page = 'https://www.amazon.com/s?i=stripbooks&bbn=1&rh=n%3A283155%2Cn%3A1000%2Cn%3A1%2Cp_n_publication_date%3A1250226011&dc&page=' + str(self.page_number) + '&fst=as%3Aoff&qid=1606229780&rnid=1250225011&ref=sr_pg_2'
        if self.page_number <= 3:
            self.page_number += 1
            yield response.follow(next_page, callback=self.parse)
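Note that self.page_number += 1 reads the class attribute the first time and then stores the incremented value as an instance attribute; that's fine here because Scrapy creates a single spider instance per crawl.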
As the title states, I am trying to run my Scrapy program, but it seems to yield items only from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; I have left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason, when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal, and I'm not sure how to debug since I can't even see the errors.
The root cause is that your start_urls list is missing commas between entries, so Python's implicit string-literal concatenation merges the last five URLs into one invalid string, and only the first URL is actually requested.
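A quick illustration of the missing-comma problem:

urls = [
    'https://www.antaira.com/products/10-100Mbps',
    'https://www.antaira.com/products/unmanaged-gigabit'        # <-- missing comma
    'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
]
print(len(urls))  # 2, not 3: the last two literals were merged into one string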
Try rewriting the spider with the start_requests method (and properly separated URLs). For example:
import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # iterate through each of the relative product urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # build an absolute product url
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css('section.features h3 + ul').getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            # assign values directly; trailing commas would wrap each value in a tuple
            items['name'] = name
            items['features'] = features
            items['overview'] = overview
            items['main_image'] = main_image
            items['rel_links'] = rel_links
            yield items
I am new to Scrapy and this is my first try at web scraping. The structure of the website I am trying to scrape is the following:
level 0: main company URL ---> level 1: several associated company URLs ---> level 2: each associated company URL from level 1 links to many more URLs ---> ... up to level n
Right now I can scrape data up to level 1, but I want to do it up to the n-th level recursively, with a control like max_depth for how deep I want to scrape.
I cannot figure out how to do it.
Here is the spider I have written so far:
import scrapy
from ..items import *

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    base_url = 'https://www.zaubacorp.com/'
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def parse(self, response):
        search_links = response.xpath('//table[@id="results"]/tr/td/a[contains(@href,"company/DOIT-URBAN")]/@href').getall()
        page_list = search_links[1:]
        #url = search_links.pop(0)
        check_list = []
        for url in search_links:
            print("func 1")
            yield response.follow(url=url, callback=self.parse_doit,
                                  meta={'page_list': page_list,
                                        'check_list': check_list})

    def parse_doit(self, response):
        print("func 2")
        check_list = response.meta['check_list']
        lnk = MainLink()
        lnk['name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        lnk['url'] = response.url
        lnk['address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        lnk['email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        lnk['director1'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        lnk['director2'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        dir1_same_co_list = response.xpath('//*[@id="accordion1"]/table[1]//td//p/a/@href').getall()
        dir2_same_co_list = response.xpath('//*[@id="accordion2"]/table[1]//td//p/a/@href').getall()
        co_list = dir1_same_co_list + list(set(dir2_same_co_list) - set(dir1_same_co_list))
        dir_same_co_list = list(set(co_list) - set(check_list))
        check_list = check_list + list(set(dir_same_co_list) - set(check_list))
        page_list = response.meta['page_list']
        if dir1_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})

    def parse_level_2(self, response):
        print("func 3")
        lnk = response.meta['name']
        lnk = response.meta['url']
        lnk = response.meta['address']
        lnk = response.meta['email']
        lnk = response.meta['director1']
        lnk = response.meta['director2']
        page_list = response.meta['page_list']
        #next_page = response.meta['next_page']
        level_2 = SecondaryLink()
        try:
            lnk['Company_Details_W_Same_Directors']
        except:
            lnk['Company_Details_W_Same_Directors'] = []
        #for sub_link in dir1_same_co_list:
        level_2['Co_Name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        level_2['Co_url'] = response.url
        level_2['Address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        level_2['Email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        level_2['First_Director'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        level_2['Second_Director'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        lnk['Company_Details_W_Same_Directors'].append(level_2)
        dir_same_co_list = response.meta['dir_same_co_list']
        print("===== start reading co list =====")
        if dir_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            print("co list", len(dir_same_co_list))
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})
        else:
            if page_list:
                print("next page loop")
                next_page = page_list.pop(0)
                next_page_url = next_page
                yield response.follow(url=next_page_url, callback=self.parse_doit,
                                      meta={'name': lnk,
                                            'url': lnk,
                                            'address': lnk,
                                            'email': lnk,
                                            'director1': lnk,
                                            'director2': lnk,
                                            'next_page': next_page,
                                            'page_list': page_list})
            else:
                yield lnk
and items.py is as follows:
class MainLink(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    address = scrapy.Field()
    email = scrapy.Field()
    director1 = scrapy.Field()
    Company_Details_W_Same_Directors = scrapy.Field()
    director2 = scrapy.Field()

class SecondaryLink(scrapy.Item):
    Co_Name = scrapy.Field()
    Co_url = scrapy.Field()
    Address = scrapy.Field()
    Email = scrapy.Field()
    First_Director = scrapy.Field()
    Second_Director = scrapy.Field()
Help is much appreciated.
You can make use of the DEPTH_LIMIT setting in Scrapy. See https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit
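For example, a minimal sketch of how you could cap the recursion depth (DEPTH_LIMIT is a standard Scrapy setting; the link-following body here is illustrative, not your full spider):

import scrapy

class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    custom_settings = {
        'DEPTH_LIMIT': 3,  # requests more than 3 link-hops deep are dropped
    }

    def parse(self, response):
        # DepthMiddleware tracks the current depth in response.meta['depth']
        self.logger.info('crawling at depth %s', response.meta.get('depth', 0))
        for href in response.xpath('//a/@href').getall():
            yield response.follow(href, callback=self.parse)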
I am relatively new to Python and Scrapy.
I am trying to scrape the job portal https://www.jobs.ch/de/. Currently I start with https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/.
At the moment the scraper works fine but does not return all job results. Out of 24 results per page, Scrapy returns a varying number (tested with 2 pages: 21/24 and 23/24). I checked whether the CSS path is different for the missing results, but it is identical. Does anybody have an idea why I don't get all the results? I would really appreciate any suggestions.
import scrapy
from jobscraping.items import JobscrapingItem

class GetdataSpider(scrapy.Spider):
    name = 'getdata5'
    start_urls = ['https://www.jobs.ch/de/stellenangebote/administration-hr-consulting-ceo/']

    def parse(self, response):
        yield from self.scrape(response)
        next_page = response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.itnOWY > a.sc-fznxsB.fvMaWZ.Link-sc-1vy3ms6-1.fvbIfL:last-child').attrib['href']
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def scrape(self, response):
        for add in response.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd'):
            item = JobscrapingItem()
            addpage = response.urljoin(add.css('div.sc-AxiKw.VacancySerpItem__ShadowBox-qr45cp-0.hqhfbd a::attr(href)').get(default='not-found'))
            item['addlink'] = addpage
            item['Position'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.VacancySerpItem___StyledText-qr45cp-6.gHnsfC::text').get(default='not-found')
            item['Company'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY > strong::text').get(default='not-found')
            item['Location'] = add.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.YSULY::text').get(default='not-found')
            request1 = scrapy.Request(addpage, callback=self.get_addinfos)
            request1.meta['item'] = item
            yield request1

    def get_addinfos(self, response):
        for details in response.css('div.sc-AxiKw.Flex-sc-8fidy7-0.VacancyDetailHead__StyledVacancyDetailHead-sc-14lkltl-0.VacancyDetailHead___StyledStyledVacancyDetailHead-sc-14lkltl-1.deEQGn'):
            item = response.meta['item']
            companypage = response.urljoin(details.css('div.sc-AxiKw.XkVWn > span > div > a::attr(href)').get(default='not-found'))
            item['companylink'] = companypage
            item['Date'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(1) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Rank'] = details.css('div.sc-AxiKw.bFWxot > span:nth-child(2) > span.sc-fzqNJr.cPhlSZ::text').get(default='not-found')
            item['Workload'] = details.css('span.sc-fzqNJr.Badge-sc-88vuku-0.dCIQfi::text').get(default='not-found')
            request2 = scrapy.Request(companypage, callback=self.get_companyinfos)
            request2.meta['item'] = item
            yield request2

    def get_companyinfos(self, response):
        item = response.meta['item']
        item['Industry'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.leHDqM::text').get(default='not-found')
        item['Open_jobs'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.fMPCQO > span::text').get(default='not-found')
        item['Employees'] = response.css('div.sc-AxiKw.bFisst > div > ul > li.sc-fznyAO.Li-sc-1gb2r8a-0.GqJfV > span::text').get(default='not-found')
        item['Rating_overall'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.gKcdxd::text').get(default='not-found')
        item['Rating_detailed'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.hVUXAg::text').getall()
        item['Rating_numbers'] = response.css('span.sc-fzqNJr.Text__span-jiiyzm-8.Text-jiiyzm-9.kxNaWG::text').get(default='not-found')
        yield item
The items.py file:
import scrapy

class JobscrapingItem(scrapy.Item):
    # define the fields for your item here like:
    addlink = scrapy.Field()
    Position = scrapy.Field()
    Company = scrapy.Field()
    Location = scrapy.Field()
    Date = scrapy.Field()
    Rank = scrapy.Field()
    Workload = scrapy.Field()
    companylink = scrapy.Field()
    Industry = scrapy.Field()
    Open_jobs = scrapy.Field()
    Employees = scrapy.Field()
    Rating_overall = scrapy.Field()
    Rating_detailed = scrapy.Field()
    Rating_numbers = scrapy.Field()
I found the mistake in my code. Since some companies post more than one job ad, several ads link to the same company page, and Scrapy's duplicate request filter was dropping those repeated requests, along with the items attached to them. I set dont_filter=True on the company-page request, which resolved the issue. I also added an if statement for postings that have no company link, so the scraper yields those items directly instead of trying to crawl the company page.
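For anyone hitting the same issue, the change is a single keyword argument on the request that was being filtered; a sketch based on the spider above:

request2 = scrapy.Request(companypage, callback=self.get_companyinfos, dont_filter=True)  # bypass the duplicate-request filter
request2.meta['item'] = item
yield request2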
I'm still new to Scrapy. When trying to read data from quotes.toscrape.com, I don't get the content I expect when using XPath selectors. As soon as I use CSS selectors, everything works as intended. I just can't find the error, even though the example is super simple.
quotes.py
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item

        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
items.py
import scrapy
from scrapy.loader import ItemLoader

class QuotesLoaderItem(scrapy.Item):
    # define the fields for your item here like:
    author_name = scrapy.Field()
    quote_text = scrapy.Field()
    author_link = scrapy.Field()
    tags = scrapy.Field()
Result
author_name,quote_text,author_link,tags
Albert Einstein,“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,/author/Albert-Einstein,change
Albert Einstein, ...
...
(20 times)
Thanks for your help.
I was using a Selector object instead of a Response object, so the XPath expressions have to be relative (anchored with .//); otherwise they search the whole document and always return the first match. The corrected code looks like this:
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector (note the leading dot: relative to the current quote)
            item['author_name'] = quote.xpath('.//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('.//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('.//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('.//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item

        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
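To see the difference, compare the two forms on a single quote selector (an illustrative scrapy shell session against http://quotes.toscrape.com; the exact author names depend on the live page):

>>> quote = response.xpath('//div[@class="quote"]')[1]     # the second quote on the page
>>> quote.xpath('//small[@class="author"]/text()').get()   # no dot: searches the whole document
'Albert Einstein'
>>> quote.xpath('.//small[@class="author"]/text()').get()  # with dot: searches only within this quote
'J.K. Rowling'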
I want to scrape the information from the following pages as well; however, my code only scrapes the first page.
My code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from ..items import PropertyItem

class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.com']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        item = PropertyItem()
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item

            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
It's all about your allowed_domains: the site is starproperty.my, but you allowed starproperty.com, so Scrapy's offsite middleware filters out every followed request (you need to fix your indentation too). Also, I'm sure you want to define your item inside your loop:
class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.my']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item = PropertyItem()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item

        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
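As a debugging aid: requests dropped this way show up in the DEBUG log as "Filtered offsite request" messages, so running the crawl with the log level at DEBUG makes an allowed_domains mismatch easy to spot.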
Maybe it's due to indentation? The pagination block should run once per page, outside the for loop. Try changing:

            yield item

            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)

to:

            yield item

        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)