Getting to the next page using scrapy - python

I am trying to build a web scraper, but I'm unable to get the link to the next page. I have tried several combinations, but none of them work. The tutorial on scrapy.org uses a simpler page layout, so it doesn't cover my case.
The site I'm scraping has the following layout:
<nav class="nav_class">
    <a class="class_1" href="1.html"></a>
    <a class="class_2" href="2.html"></a>
    <a class="class_3" href="3.html"></a>
</nav>
I want to get the 3.html link using CSS selectors.
import scrapy


class MySpider(scrapy.Spider):
    name = "flip_spider"

    def start_requests(self):
        urls = [
            "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=1",
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # page_id = response.url.split("=")[-1]
        phone_details = response.css("div._1-2Iqu.row")
        for ph in phone_details:
            phone = ph.css("div._3wU53n::text").get()
            rating = ph.css("div.hGSR34::text").get()
            price = ph.css("div._1vC4OE._2rQ-NK::text").get()
            yield {
                "name": phone,
                "rating": rating,
                "price": price,
            }
        final = "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=6"
        next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
        # ^ This is the line I need help with
        if next_page_id is not final:
            next_page = response.urljoin(next_page_id)
            yield scrapy.Request(next_page, callback=self.parse)
It only scrapes the first page and then stops.

Change your code to this and it will work. (Note that is not compares object identity rather than equality, so your original condition is always true; on the last page .get() returns None, which is why a plain truthiness check is the right test.)
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
    next_page = response.urljoin(next_page_id)
    yield scrapy.Request(next_page, callback=self.parse)
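As an aside, on Scrapy 1.4 or newer, response.follow resolves the relative href for you, so the explicit urljoin isn't needed; a minimal sketch:

next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
    # response.follow accepts a relative URL and joins it against
    # response.url internally (Scrapy >= 1.4)
    yield response.follow(next_page_id, callback=self.parse)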

Related

how to scrape the URL on Scrapy Following Links

I am confused about how to scrape the URL itself when following links in Scrapy.
I am crawling the page in start_urls below:
import scrapy

from ..items import SkripsiItem


class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)
        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()
        items['title'] = extract_with_css('h1::text')
        items['author'] = extract_with_css('.author a::text')
        items['time'] = extract_with_css('time::text')
        items['imagelink'] = extract_with_css('.article img::attr(src)')
        items['content'] = ''.join(content)
        yield items
How do I scrape every URL that is visited when following links, i.e. the links selected by .lnk-t a::attr(href) in the code above?
Save items['url'] = response.url in the parse_author function.
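A minimal sketch of what that looks like inside the existing parse_author (only the extra assignment is new):

def parse_author(self, response):
    items = SkripsiItem()
    # ... the existing extract_with_css assignments ...
    items['url'] = response.url  # the URL of the page currently being scraped
    yield items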

Scrapy multiple next page

I want to scrape every "next" page. I've found a way to do it with scrapy shell, but I don't know whether my spider will iterate through every page or just the next one, and I'm not sure how to implement that.
import string

import scrapy
from scrapy import Request

alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url


class AnimeScraper_Spider(scrapy.Spider):
    name = "Anime"

    def start_requests(self):
        for url in each_url:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        next_page_url = response.xpath(
            "//div[@class='bgColor1']//a[text()='Next']/@href").extract_first()
        for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)'):
            url = response.urljoin(href.extract())
            yield Request(url, callback=self.parse_anime)
        yield Request(next_page_url, callback=self.parse)

    def parse_anime(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            return {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip()
            }
I think you're trying something too complicated; it should be as simple as this:
Start from the main page.
Identify all the pages that start with a particular letter.
For each of these pages, take all the "next" links and repeat.
It looks something like this:
import string

import scrapy
from scrapy import Request


class AnimeSpider(scrapy.Spider):
    name = "Anime"
    start_urls = ['https://myanimelist.net/anime.php']

    def parse(self, response):
        xp = "//div[@id='horiznav_nav']//li/a/@href"
        return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())

    def parse_anime_list_page(self, response):
        for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
            yield {
                "title": tr_sel.css('a[id] strong::text').extract_first().strip(),
                "synopsis": tr_sel.css("div.pt4::text").extract_first(),
                "type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
                "episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
                "rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
            }
        next_urls = response.xpath("//div[@class='spaceit']//a/@href").extract()
        for next_url in next_urls:
            yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)
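One detail that makes this pattern safe: Scrapy's duplicate request filter is on by default, so even when several pages yield the same "next" URL, it will only be crawled once.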

how to scrape <ul> <li> <a>

I'm a newbie at using Scrapy. I want to scrape the links on this website, harga-hp, in the element shown in the picture I shared.
When I click on Xiaomi, it links to the Xiaomi page, and then I scrape the price and the name. Can someone help me fix this code?
import scrapy
from handset.items import HandsetItem


class HandsetpriceSpider(scrapy.Spider):
    name = 'handsetprice'
    start_urls = ['http://id.priceprice.com/harga-hp/']

    def parse(self, response):
        urls = response.css('ul.maker > a::attr(href)').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)
        next_page_url = response.css('li.last > a::attr(href)').extract_first()
        if next_page_url:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def parse_details(self, response):
        yield {
            'Name': response.css('li.name a::text').extract_first(),
            'Price': response.css('.newPice::text').extract_first(),
        }
And the items.py:
import scrapy
from scrapy.item import Item, Field


class HandsetItem(scrapy.Item):
    Name = scrapy.Field()
    Price = scrapy.Field()
Your CSS selector for urls needs to follow the path ul > li > a, just like in the title of your question.
You also misspelled 'newPrice' in parse_details(), which will surface once you fix the urls selector.
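Putting both fixes together, the relevant parts would look like this (a sketch; it assumes the price element's class really is newPrice):

def parse(self, response):
    # The <a> elements sit inside <li> items, so the selector must
    # descend through them: ul > li > a
    urls = response.css('ul.maker > li > a::attr(href)').extract()
    for url in urls:
        yield scrapy.Request(url=response.urljoin(url), callback=self.parse_details)

def parse_details(self, response):
    yield {
        'Name': response.css('li.name a::text').extract_first(),
        'Price': response.css('.newPrice::text').extract_first(),  # was '.newPice'
    }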

Scrapy _ How to append / delete text to listing URL

I'm new to Python and Scrapy. I'm trying to create a spider to scrape: https://www.festicket.com/festivals/
I've managed to get the spider working; the problem is that some URLs look like this:
https://www.festicket.com/festivals/electric-daisy-carnival-edc-las-vegas/2018/
while some URLs have /shop/#ticket appended to them, which is stopping the spider from crawling the listing page.
My question is: if the spider finds a URL ending in /shop/#ticket, is there some way to delete the /shop/#ticket part but keep the rest of the URL?
My code so far is below:
import scrapy


class AuthorsSpider(scrapy.Spider):
    name = "festicket"
    start_urls = ['https://www.festicket.com/festivals/']
    npages = 20

    # This mimics getting the pages using the next button.
    for i in range(2, npages + 2):
        start_urls.append("https://www.festicket.com/festivals/?page=" + str(i))

    # Scrape and follow listings
    def parse(self, response):
        urls = response.xpath(
            "//h3[@class='festival-title heading-3ry notranslate']//@href").extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url=url, callback=self.parse_details)

    def parse_details(self, response):
        yield {
            'title': response.xpath("//h1[@class='sc-jzJRlG gbLQoU']/text()").extract_first(),
            'festival_url': response.xpath("//meta[@property='og:url']/@content").extract_first(),
            'location': response.xpath("//ul[contains(@class,'styles__StyledList')][1]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
            'address': response.xpath("//div[@class='sc-gzVnrw bpJeJY'][2]/section[@class='sc-gZMcBi gDrvBk']/div/p[@class='sc-chPdSV hifsJb']/descendant::text()").extract_first(),
            'date': response.xpath("//ul[contains(@class,'styles__StyledList')][1]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
            'genre1': response.xpath("//ul[contains(@class,'styles__StyledList')][2]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
            'genre2': response.xpath("//ul[contains(@class,'styles__StyledList')][2]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
            'genre3': response.xpath("//ul[contains(@class,'styles__StyledList')][2]/li[contains(@class,'styles__DotSeparatorSpan-h0jg7b')][3]/descendant::text()").extract_first(),
            'subtitle1': response.xpath("//h2[@class='sc-cSHVUG gCeeYI']/descendant::text()").extract_first(),
            'subtitle2': response.xpath("//span[@class='styles__StyledHtmlWrapper-l0qhyk-0 cUaVYv sc-jAaTju jlDUtI']/p/descendant::text()").extract_first(),
            'para1': response.xpath("//span[@class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[1]/descendant::text()").extract_first(),
            'para2': response.xpath("//span[@class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[2]/descendant::text()").extract_first(),
            'para3': response.xpath("//span[@class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[3]/descendant::text()").extract_first(),
            'flyer': response.xpath("//img[contains(@class,'styles__Artwork')]/@src").extract_first(),
            'banner_image_1': response.xpath("//div[@class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][1]/div[@class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[@class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/@src").extract_first(),
            'banner_image_2': response.xpath("//div[@class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][2]/div[@class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[@class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/@src").extract_first(),
            'banner_image_3': response.xpath("//div[@class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][3]/div[@class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[@class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/@src").extract_first(),
        }
You need to change this part:
for url in urls:
    url = response.urljoin(url)
    yield scrapy.Request(url=url, callback=self.parse_details)
to:
for url in urls:
    if "/shop/#ticket" in url:
        continue  # skip these URLs ('continue' is the Python statement, not 'next')
    url = response.urljoin(url)
    yield scrapy.Request(url=url, callback=self.parse_details)
UPDATE
If instead you want to strip "/shop/#ticket" from the end of a URL rather than skip it (this requires import re at the top of the spider):
for url in urls:
    url = re.sub(r'/shop/#ticket$', "", url)
    url = response.urljoin(url)
    yield scrapy.Request(url=url, callback=self.parse_details)
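An equivalent without the regex, using plain string slicing (a sketch; same urls loop as above):

suffix = "/shop/#ticket"
for url in urls:
    if url.endswith(suffix):
        url = url[:-len(suffix)]  # keep everything before '/shop/#ticket'
    url = response.urljoin(url)
    yield scrapy.Request(url=url, callback=self.parse_details)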

Scrapy returns repeated out of order results when using a for loop, but not when going link by link

I am attempting to use Scrapy to crawl a site. Here is my code:
import scrapy


class ArticleSpider(scrapy.Spider):
    name = "article"
    start_urls = [
        'http://www.irna.ir/en/services/161',
    ]

    def parse(self, response):
        for linknum in range(1, 15):
            next_article = response.xpath('//*[@id="NewsImageVerticalItems"]/div[%d]/div[2]/h3/a/@href' % linknum).extract_first()
            next_article = response.urljoin(next_article)
            yield scrapy.Request(next_article)
        for text in response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]'):
            yield {
                'article': text.xpath('./text()').extract()
            }
        for tag in response.xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_bodytext"]'):
            yield {
                'tag1': tag.xpath('./div[3]/p[1]/a/text()').extract(),
                'tag2': tag.xpath('./div[3]/p[2]/a/text()').extract(),
                'tag3': tag.xpath('./div[3]/p[3]/a/text()').extract(),
                'tag4': tag.xpath('./div[3]/p[4]/a/text()').extract()
            }
        yield response.follow('http://www.irna.ir/en/services/161', callback=self.parse)
But this returns a weird mixture of repeated, out-of-order items in the JSON, often skipping links: https://pastebin.com/LVkjHrRt
However, when I set linknum to a single number, the code works fine.
Why is iterating changing my results?
As @TarunLalwani already stated, your current approach is not right. Basically you should:
In the parse method, extract the links to all articles on a page and yield requests to scrape them with a callback named e.g. parse_article.
Still in the parse method, check whether the button for loading more articles is present, and if so, yield a request for a URL of the pattern http://www.irna.ir/en/services/161/pageN. (This pattern can be found in the browser's developer tools, under the XHR requests on the Network tab.)
Define a parse_article method where you extract the article text and tags from the details page and finally yield them as an item.
Below is the final spider:
import scrapy


class IrnaSpider(scrapy.Spider):
    name = 'irna'
    base_url = 'http://www.irna.ir/en/services/161'

    def start_requests(self):
        yield scrapy.Request(self.base_url, meta={'page_number': 1})

    def parse(self, response):
        for article_url in response.css('.DataListContainer h3 a::attr(href)').extract():
            yield scrapy.Request(response.urljoin(article_url), callback=self.parse_article)
        page_number = response.meta['page_number'] + 1
        if response.css('#MoreButton'):
            yield scrapy.Request('{}/page{}'.format(self.base_url, page_number),
                                 callback=self.parse, meta={'page_number': page_number})

    def parse_article(self, response):
        yield {
            'text': ' '.join(response.xpath('//p[@id="ctl00_ctl00_ContentPlaceHolder_ContentPlaceHolder_NewsContent4_BodyLabel"]/text()').extract()),
            'tags': [tag.strip() for tag in response.xpath('//div[@class="Tags"]/p/a/text()').extract() if tag.strip()]
        }
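To run it and collect the items, a minimal driver script works (this assumes Scrapy 2.1+, where the FEEDS setting exists; running scrapy crawl irna -o articles.json from the command line does the same):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    'FEEDS': {'articles.json': {'format': 'json'}},  # write scraped items to JSON
})
process.crawl(IrnaSpider)
process.start()  # blocks until the crawl finishes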
