Scrapy _ How to append / delete text to listing URL

Scrapy _ How to append / delete text to listing URL - python

I'm new to Python and Scrapy. I'm trying to create a spider to scrape: https://www.festicket.com/festivals/
I've managed to get the spider working, the problem is that some URLs are like so:
https://www.festicket.com/festivals/electric-daisy-carnival-edc-las-vegas/2018/
and some URLS have: /shop/#ticket appended to them which is stoping the spider from crawling the listing page.
My question is, is there some way that if the spider finds a URL with /shop/#ticket it simple deletes the /shop/#ticket but keeps the rest of the URL???
My code so far is below:
import scrapy
class AuthorsSpider(scrapy.Spider):
name = "festicket"
start_urls = ['https://www.festicket.com/festivals/']
npages = 20
# This mimics getting the pages using the next button.
for i in range(2, npages + 2):
start_urls.append("https://www.festicket.com/festivals/?page=" + str(i) + "")
#Scrape and follow listings
def parse(self, response):
urls = response.xpath(
"//h3[#class='festival-title heading-3ry notranslate']//#href").extract()
for url in urls:
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
def parse_details(self, response):
yield {
'title': response.xpath("//h1[#class='sc-jzJRlG gbLQoU']/text()").extract_first(),
'festival_url': response.xpath("//meta[#property='og:url']/#content").extract_first(),
'location': response.xpath("//ul[contains(#class,'styles__StyledList')][1]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
'address': response.xpath("//div[#class='sc-gzVnrw bpJeJY'][2]/section[#class='sc-gZMcBi gDrvBk']/div/p[#class='sc-chPdSV hifsJb']/descendant::text()").extract_first(),
'date': response.xpath("//ul[contains(#class,'styles__StyledList')][1]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
'genre1': response.xpath("//ul[contains(#class,'styles__StyledList')][2]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
'genre2': response.xpath("//ul[contains(#class,'styles__StyledList')][2]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
'genre3': response.xpath("//ul[contains(#class,'styles__StyledList')][2]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][3]/descendant::text()").extract_first(),
'subtitle2': response.xpath( "//span[#class='styles__StyledHtmlWrapper-l0qhyk-0 cUaVYv sc-jAaTju jlDUtI']/p/descendant::text()").extract_first(),
'subtitle1': response.xpath("//h2[#class='sc-cSHVUG gCeeYI']/descendant::text()").extract_first(),
'para1': response.xpath("//span[#class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[1]/descendant::text()").extract_first(),
'para2': response.xpath("//span[#class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[2]/descendant::text()").extract_first(),
'para3': response.xpath("//span[#class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[3]/descendant::text()").extract_first(),
'flyer': response.xpath("//img[contains(#class,'styles__Artwork')]/#src").extract_first(),
'subtitle2': response.xpath("//span[#class='styles__StyledHtmlWrapper-l0qhyk-0 cUaVYv sc-jAaTju jlDUtI']/p/descendant::text()").extract_first(),
'banner_image_1': response.xpath("//div[#class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][1]/div[#class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[#class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/#src").extract_first(),
'banner_image_2': response.xpath("//div[#class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][2]/div[#class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[#class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/#src").extract_first(),
'banner_image_3': response.xpath("//div[#class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][3]/div[#class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[#class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/#src").extract_first(),
}

You need to change this part:
for url in urls:
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
to:
for url in urls:
if "/shop/#ticket" in url:
next
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
UPDATE
If you want to replace "/shop/#ticket" at the end of an URL:
for url in urls:
url = re.sub( r'/shop/#ticket$', "", url )
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)

Related

How could I request urls just changes last part of url

In spider, I just want to request URLs that have one rule.
URLs list :
www.example.com/bread/coffee/A
www.example.com/bread/coffee/B
www.example.com/bread/coffee/C
www.example.com/bread/coffee/D
so start_request is www.example.com/bread/coffee/A
and then what gonna have to do in def parse ??
class MySpider(scrapy.Spider):
name = 'exmple.com'
start_urls = [www.example.com/bread/coffee/A]
def parse(self, response):
???
yield ???
a little hint will appreciate

you can use code like this:
class MySpider(scrapy.Spider):
name = 'exmple.com'
start_urls = ['www.example.com/bread/coffee/A']
def start_requests(self):
urls = [
'www.example.com/bread/coffee/A',
'www.example.com/bread/coffee/B',
'www.example.com/bread/coffee/C',
'www.example.com/bread/coffee/D'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
# find what you need
yield # your item/dict
also, make sure your url(s) has correct format, they should contains http or https
aslo you can use string module to generate your urls
import string
def start_requests(self):
url = 'www.example.com/bread/coffee/{}'
for l in string.ascii_uppercase:
url = url.format(l)
yield scrapy.Request(url=url, callback=self.parse)

how to scrape the URL on Scrapy Following Links

I am confused how to scrape the URL itself in following links scrapy.
I do crawling on this page here
import scrapy
from ..items import SkripsiItem
class SkripsiSpiderSpider(scrapy.Spider):
name = 'skripsi'
start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']
def parse(self, response):
for href in response.css('.lnk-t a::attr(href)'):
yield response.follow(href, self.parse_author)
for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
yield response.follow(href, self.parse)
def parse_author(self, response):
items = SkripsiItem()
def extract_with_css(query):
return response.css(query).get(default='').strip()
content = response.xpath(".//div[#class='vidy-embed']/descendant::text()").extract()
items['title'] = extract_with_css('h1::text'),
items['author'] = extract_with_css('.author a::text'),
items['time'] = extract_with_css('time::text'),
items['imagelink'] = extract_with_css('.article img::attr(src)'),
items['content'] = ''.join(content),
yield items
how to scrape every url that is visited at the following link, which is in the code above is .lnk -t a :: attr (href)

Save items['url'] = response.url in the parse_author function.

Getting to the next page using scrapy

I am trying to make a web scraper but I'm unable to get the link of the next page. I have tried some combinations but none of them work. The tutorial on scrapy.org has a simpler format so it doesn't solve my problem
The site I'm scraping has the following layout:
<nav class="nav_class">
<a class="class_1" href="1.html">
<a class="class_2" href="2.html">
<a class="class_3" href="3.html">
I want to get the 3.html link using css selectors
import scrapy
class MySpider(scrapy.Spider):
name = "flip_spider"
def start_requests(self):
urls = [
"https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=1",
]
for url in urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
# page_id=response.url.split("=")[-1]
phone_details = response.css("div._1-2Iqu.row")
for ph in phone_details:
phone = ph.css("div._3wU53n::text").get()
rating = ph.css("div.hGSR34::text").get()
price = ph.css("div._1vC4OE._2rQ-NK::text").get()
yield{
"name": phone,
"rating": rating,
"price": price,
}
final = "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=6"
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
# ^This is the line I need help with
if next_page_id is not final:
next_page = response.urljoin(next_page_id)
yield scrapy.Request(next_page, callback=self.parse)
It only scrapes the first page and then stops

Change you code to this and it will work
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
next_page = response.urljoin(next_page_id)
yield scrapy.Request(next_page, callback=self.parse)

Scrapy multiple next page

I want to scrape every next page. I've found a way to do it with scrapy shell but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.
alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url
class AnimeScraper_Spider(scrapy.Spider):
name = "Anime"
def start_requests(self):
for url in each_url:
yield scrapy.Request(url=url, callback= self.parse)
def parse(self, response):
next_page_url = response.xpath(
"//div[#class='bgColor1']//a[text()='Next']/#href").extract_first()
for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)') :
url = response.urljoin(href.extract())
yield Request(url, callback = self.parse_anime)
yield Request(next_page_url, callback=self.parse)
def parse_anime(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
return {
"title" : tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis" : tr_sel.css("div.pt4::text").extract_first(),
"type_" : tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes" : tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating" : tr_sel.css('td:nth-child(5)::text').extract_first().strip()
}

I think that you're trying something too complicated, it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like that:
import string
import scrapy
from scrapy import Request
class AnimeSpider(scrapy.Spider):
name = "Anime"
start_urls = ['https://myanimelist.net/anime.php']
def parse(self, response):
xp = "//div[#id='horiznav_nav']//li/a/#href"
return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())
def parse_anime_list_page(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
yield {
"title": tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis": tr_sel.css("div.pt4::text").extract_first(),
"type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
}
next_urls = response.xpath("//div[#class='spaceit']//a/#href").extract()
for next_url in next_urls:
yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)

whats wrong with this scrapy spider? scrapes only last url

In method parse() spider crawls 4 urls and then sends to method parse_dir_contents() to scrape some data but only 4th url is being scraped I don't understand why it is not scraping other 3 urls?
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
name = "linkedin"
allowed_domains = ["linkedin.com"]
start_urls = [
"https://in.linkedin.com/directory/people-s-1-2-4/",
]
def parse(self, response):
for href in response.xpath('//*[#id="seo-dir"]/div/div/div/ul/li/a/#href'):
url = response.urljoin(href.extract())
print "________________"+url
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath('//*[#id="profile"]'):
url = response.url
print "____________"+url
item = VOneItem()
item['name'] = sel.xpath('//*[#id="name"]/text()').extract()
item['headline'] = sel.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = url
yield item

By inspecting the pages I think that there is no need of the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
item = VOneItem()
item['name'] = response.xpath('//*[#id="name"]/text()').extract()
item['headline'] = response.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = response.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = response.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = response.url
return item
And check if this solves your issue.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy _ How to append / delete text to listing URL - python

Related

How could I request urls just changes last part of url

how to scrape the URL on Scrapy Following Links

Getting to the next page using scrapy

Scrapy multiple next page

whats wrong with this scrapy spider? scrapes only last url

Categories

Resources