Can't scrape next page contents using Scrapy

Can't scrape next page contents using Scrapy - python

I want to scrape the contents of the following pages as well, but the spider never goes to the next page. My code is:
import scrapy
class AggregatorSpider(scrapy.Spider):
    """Spider from the question: prints processors and tries to follow pagination."""

    name = 'aggregator'
    # NOTE: this value contains a URL path rather than a bare domain. It is the
    # setting the answer below replaces and is kept here as originally asked.
    allowed_domains = ['startech.com.bd/component/processor']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        """Print each product's name and price, then request the next page."""
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            print('\n')
            print(name)
            print(price)
            print('\n')
        # Takes the first link found inside the pagination list.
        next_page_url = response.xpath('//*[@class="pagination"]/li/a/@href').extract_first()
        # absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(next_page_url)
I didn't use urljoin because next_page_url already gives me the whole URL. I also tried passing dont_filter=True to scrapy.Request, which gives me an infinite loop on the 1st page. The message I'm getting in the terminal is: [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.startech.com.bd': <GET https://www.startech.com.bd/component/processor?page=2>

This is because your allowed_domains variable is wrong, use allowed_domains = ['www.startech.com.bd'] instead (see the doc).
You can also modify your next page selector in order to avoid going to page one again:
import scrapy
class AggregatorSpider(scrapy.Spider):
    """Crawl the processor listing and yield one item per product on every page."""

    name = 'aggregator'
    allowed_domains = ['www.startech.com.bd']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        """Yield a ``{'name', 'price'}`` dict per product, then follow pagination."""
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            yield {
                'name': processor.xpath('.//h4/a/text()').extract_first(),
                'price': processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first(),
            }
        # Use the link in the last pagination entry rather than the first one,
        # so the spider is not sent back to page one again.
        next_page_url = response.css('.pagination li:last-child a::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(next_page_url, callback=self.parse)

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

As the title states, I am trying to run my Scrapy program. The issue I am running into is that it seems to return only the results from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong. In my code I have also left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    """Spider from the question: follows product links and builds AntairaItem objects."""

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    # NOTE: only the first entry is followed by a comma. Python concatenates
    # adjacent string literals, so the remaining five lines form ONE string.
    # Kept as originally asked; the answer below lists the URLs with commas.
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        """Request every link found inside a product container."""
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one AntairaItem per ``main.products`` element on a product page."""
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            # NOTE: the trailing commas below make each stored value a 1-tuple.
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason, when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal. I'm not sure how to debug this, since I can't even see any error output.

Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):
    """Crawl the Antaira category pages and yield one AntairaItem per product."""

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        """Issue one request per category URL, each handled by ``parse``."""
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Request every link found inside a product container."""
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one AntairaItem per ``main.products`` element on a product page."""
        for product in response.css('main.products'):
            item = AntairaItem()
            item['product_link'] = response.url
            # Assign the values directly: a trailing comma after the value
            # would store a 1-tuple instead of the value itself.
            # default='' keeps .strip() from failing when the heading is absent.
            item['name'] = product.css('h1.product-name::text').get(default='').strip()
            item['features'] = product.css('section.features h3 + ul').getall()
            item['overview'] = product.css('.products .product-overview::text').getall()
            item['main_image'] = response.urljoin(product.css('div.selectors img::attr(src)').get())
            # The leading '//' makes this query search the whole document,
            # not just the current product element.
            item['rel_links'] = product.xpath(
                "//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]"
            ).getall()
            yield item

Extracting next page and setting a break

I'm trying to extract webpage data and wish to take the next few pages as well, but only up to a limit that I can alter. I've tested whether I can at least extract the next few pages using Scrapy (I'm trying to figure this out in Scrapy to learn it), but it only returns the items from the first page.
How do I extract the next pages while setting a limit, e.g. 5 pages?
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
class StatisticsItem(scrapy.Item):
    # Both fields are declared with a TakeFirst output processor.
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
    """Spider from the question: collects eBay card listings and their location."""

    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        """Request every URL in ``start_urls`` (handled by ``parse`` by default)."""
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        """Request each product's detail page, then look for a next-page link."""
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)
        # get the next page
        # NOTE: this query is relative to `card` (a single product wrapper),
        # not to the whole response. Kept as originally asked.
        next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
        # The last page do not have a valid url and ends with '#'
        if next_page_url is None or str(next_page_url).endswith("#"):
            self.log("eBay products collected successfully !!!")
        else:
            print('\n' + '-' * 30)
            print('Next page: {}'.format(next_page_url))
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        """Add the item location to the summary passed via ``meta`` and yield it."""
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data
# Run the spider from this script: build a CrawlerProcess with the feed
# settings below, register the spider, then start the crawl.
process = CrawlerProcess(
    settings={
        'FEED_URI': 'collectible_cards.json',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(StatisticsSpider)
process.start()

You can try it like this: first build the list of page URLs, then let start_requests iterate over them:
# Number of listing pages to crawl; change this value to alter the limit.
MAX_PAGES = 5
# One URL per page; eBay selects the page through the `_pgn` query parameter.
# The upper bound is MAX_PAGES + 1 because range() excludes its end value.
start_urls = [
    "https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(page)
    for page in range(1, MAX_PAGES + 1)
]

Problems getting next page when scraping with scrapy

I have a Scrapy spider that doesn't crawl the pagination links, and I'm stuck.
The source of the page is:
https://www.levenhuk.bg/katalog/teleskopi/?page=1
My code is:
import scrapy
class TelescopesSpider(scrapy.Spider):
    """Spider from the question: scrapes telescope listings and tries to paginate."""

    name = 'telescopes'
    # NOTE: this is a full URL rather than a bare domain. It is one of the
    # points the answers below correct and is kept here as originally asked.
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        """Yield one dict per catalog item, then request a pagination link."""
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }
        # NOTE: takes the first href inside the first "pagesCount" element;
        # the answers below discuss why this does not advance past page one.
        next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))

I think the problem is simply that you are not specifying a callback in your pagination request. Specify your parse function as the callback and that should work. Please comment if it still doesn't work.
Edit:
In this case I think your logic needs an overhaul. I suggest separating the pagination logic from the item-extraction logic. Try the following:
def parse(self, response):
    """Yield the items of the current page, then request every pagination link.

    Each pagination response is handled by ``extract_item``.
    """
    # extract_item is a generator function: calling it without iterating the
    # result produces nothing, so delegate to it to emit this page's items.
    yield from self.extract_item(response)
    # getall() always returns a list (possibly empty), so no None check is needed.
    next_page_urls = response.xpath('//*[@class="pagesCount"][1]//@href').getall()
    for url in next_page_urls:
        yield scrapy.Request(response.urljoin(url), callback=self.extract_item)
def extract_item(self, response):
    """Yield one dict (name, price, short description) per catalog item."""
    for product in response.xpath('//div[@class="catalog-item"]'):
        yield {
            # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
            'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
            # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
            # Keep only the "digits,digits" part of the price text.
            'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
            'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
        }
so now the parse function handles pagination and the extract_item function extracts items for every page.
Modify allowed_domains as well as specified by Pasindu.

Change allowed_domains from the first line below to the second:
allowed_domains = ['https://www.levenhuk.bg/']
allowed_domains = ['levenhuk.bg']
You also need to change:
next_page_url = response.xpath('//*[#class="pagesCount"][1]//#href').get()
This will only work for the first page, for page 2,3,4.., this will extract a link to the first page.
And also add a callback as mentioned by UzairAhmed.

This is a little tricky, since the standard practice is to keep following the next-page button in a loop until there isn't one.
Since there is no next-page button here, the example below works out the total page count instead. This method sends a duplicate request to page 1, though, so it is not ideal.
import scrapy
class TelescopesSpider(scrapy.Spider):
    """Read the page count from the first page, then scrape every catalog page."""

    name = 'telescopes'
    # Bare domain only: allowed_domains entries must not include a scheme or path.
    allowed_domains = ['levenhuk.bg']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        """Request pages 1..N, where N is the text of the last pagination link."""
        page_labels = response.css('.pagesCount a::text').getall()
        if not page_labels:
            # No pagination links found: scrape just the page we already have.
            yield from self.parse_item(response)
            return
        total_pages = int(page_labels[-1])
        # range() excludes its end value, so add 1 to include the last page.
        for page in range(1, total_pages + 1):
            url = 'https://www.levenhuk.bg/katalog/teleskopi/?page={}'.format(page)
            # Page 1 was already fetched as the start URL; dont_filter=True
            # lets that duplicate request through so its items are scraped too.
            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)

    def parse_item(self, response):
        """Yield one dict (name, price, short description) per catalog item."""
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }
Another method would be to simply look up how many pages there are and override your start_requests method as follows:
class TelescopesSpider(scrapy.Spider):
    """Request a fixed range of catalog pages directly from ``start_requests``."""

    name = 'telescopes'
    # Bare domain only: allowed_domains entries must not include a scheme or path.
    allowed_domains = ['levenhuk.bg']
    # URL template; the page number is substituted in start_requests.
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page={}']
    download_delay = 3

    def start_requests(self):
        """Yield one request per page for pages 1 through 13."""
        for page in range(1, 14):
            yield scrapy.Request(self.start_urls[0].format(page), callback=self.parse)

Scrapy Spider not following Request callback using yield

I'm new to Scrapy and I can't get my spider to enter parse_votes in the code below, even though I set it as the callback. The other parse methods are working fine, I don't get any ERROR, and I checked the 'link' variable, which has the correct info. Help?
EDIT - Full code
class DeputadosSpider(scrapy.Spider):
    """Spider from the question: collects deputies' details and their voting pages."""

    name = "deputies"
    # NOTE: kept as originally asked; the answers below add "camara.gov.br",
    # the domain of the voting links requested in parse_deputy.
    allowed_domains = ["camara.leg.br"]
    start_urls = ["http://www2.camara.leg.br/deputados/pesquisa"]

    def parse(self, response):
        """Build one DeputiesInfo per <option> and request that deputy's page."""
        sel = Selector(response)
        sel_options = sel.xpath('//*[@id="deputado"]/option[position()>1]')
        iteration = 1
        # get deputies pages
        for sel_option in sel_options:
            item = DeputiesInfo()
            item["war_name"] = sel_option.xpath("text()").extract()
            # Text between the first '?' and the last '"' of the option's markup.
            item["link_id"] = sel_option.extract().partition('?')[-1].rpartition('"')[0]
            item["page_link"] = 'http://www.camara.leg.br/internet/Deputado/dep_Detalhe.asp?id=' + item["link_id"]
            item["id"] = iteration
            iteration += 1
            # go scrap their page
            yield scrapy.Request(item["page_link"], callback=self.parse_deputy, meta={'item': item})

    def parse_deputy(self, response):
        """Complete the item, append it to a JSON file and request the voting pages."""
        item = response.meta['item']
        sel = Selector(response)
        info = sel.xpath('//div[@id="content"]/div/div[1]/ul/li')
        # end to fill the data
        item["full_name"] = info.xpath("text()").extract_first()
        item["party"] = info.xpath("text()").extract()[2].partition('/')[0]
        item["uf"] = info.xpath("text()").extract()[2].partition('/')[-1].rpartition('/')[0]
        item["legislatures"] = info.xpath("text()").extract()[5]
        item["picture"] = sel.xpath('//div[@id="content"]/div/div[1]//img[1]/@src').extract()
        # save data to json file; the with-block closes the file after writing
        with open('deputies_info.json', 'a') as out_file:
            line = json.dumps(dict(item)) + ",\n"
            out_file.write(line)
        # colect votes info
        get_years = sel.xpath('//*[@id="my-informations"]/div[3]/div/ul/li[1]/a[position()<4]')
        for get_year in get_years:
            vote = VotesInfo()
            vote["deputy_id"] = item["id"]
            vote["year"] = get_year.xpath("text()").extract_first()
            link = get_year.xpath("@href").extract_first()
            print(vote["year"])
            print(link)
            # go to voting pages
            yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote})

    def parse_votes(self, response):
        """Placeholder callback that only prints a marker when it is reached."""
        #vote = response.meta['vote']
        print('YYYYYYYYYYYYYUHUL IM IN!!')

Your problem is allowed_domains, because the link you are trying to request in parse_deputy is for example: http://www.camara.gov.br/internet/deputado/RelVotacoes.asp?nuLegislatura=55&nuMatricula=410&dtInicio=01/01/2016&dtFim=30/12/2016
and its domain is camara.gov.br so add it to allowed_domains.
allowed_domains = ["camara.leg.br", "camara.gov.br"]
PS: I ran your code with allowed_domains commented out, and parse_votes works perfectly.

I ran your spider and found out why it never enters parse_votes.
I checked the link in yield scrapy.Request(link, callback=self.parse_votes, meta={'vote': vote}) and found that it is not in the same domain.
The link belongs to the camara.gov.br domain, which is not covered by allowed_domains = ["camara.leg.br"].
So you need to add this domain to the allowed_domains list.
allowed_domains = ["camara.leg.br", "camara.gov.br"]

whats wrong with this scrapy spider? scrapes only last url

In the parse() method the spider crawls 4 URLs and sends each one to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
    """Spider from the question: follows directory links and scrapes profile pages."""

    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        """Request every directory link, handled by ``parse_dir_contents``."""
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print("________________" + url)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield a VOneItem for each element with id "profile" on the page."""
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print("____________" + url)
            item = VOneItem()
            # NOTE: these queries start with '//' and therefore search the
            # whole document, not just `sel`.
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item

By inspecting the pages I think that there is no need of the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    """Build and return a single VOneItem from the profile page."""
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Can't scrape next page contents using Scrapy - python

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

Extracting next page and setting a break

Problems getting next page when scraping with scrapy

Scrapy Spider not following Request callback using yield

whats wrong with this scrapy spider? scrapes only last url

Categories

Resources