I'm new to Python and Scrapy. I'm trying to create a spider to scrape: https://www.festicket.com/festivals/
I've managed to get the spider working, the problem is that some URLs are like so:
https://www.festicket.com/festivals/electric-daisy-carnival-edc-las-vegas/2018/
and some URLS have: /shop/#ticket appended to them which is stoping the spider from crawling the listing page.
My question is, is there some way that if the spider finds a URL with /shop/#ticket it simple deletes the /shop/#ticket but keeps the rest of the URL???
My code so far is below:
import scrapy
class AuthorsSpider(scrapy.Spider):
name = "festicket"
start_urls = ['https://www.festicket.com/festivals/']
npages = 20
# This mimics getting the pages using the next button.
for i in range(2, npages + 2):
start_urls.append("https://www.festicket.com/festivals/?page=" + str(i) + "")
#Scrape and follow listings
def parse(self, response):
urls = response.xpath(
"//h3[#class='festival-title heading-3ry notranslate']//#href").extract()
for url in urls:
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
def parse_details(self, response):
yield {
'title': response.xpath("//h1[#class='sc-jzJRlG gbLQoU']/text()").extract_first(),
'festival_url': response.xpath("//meta[#property='og:url']/#content").extract_first(),
'location': response.xpath("//ul[contains(#class,'styles__StyledList')][1]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
'address': response.xpath("//div[#class='sc-gzVnrw bpJeJY'][2]/section[#class='sc-gZMcBi gDrvBk']/div/p[#class='sc-chPdSV hifsJb']/descendant::text()").extract_first(),
'date': response.xpath("//ul[contains(#class,'styles__StyledList')][1]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
'genre1': response.xpath("//ul[contains(#class,'styles__StyledList')][2]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][1]/descendant::text()").extract_first(),
'genre2': response.xpath("//ul[contains(#class,'styles__StyledList')][2]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][2]/descendant::text()").extract_first(),
'genre3': response.xpath("//ul[contains(#class,'styles__StyledList')][2]/li[contains(#class,'styles__DotSeparatorSpan-h0jg7b')][3]/descendant::text()").extract_first(),
'subtitle2': response.xpath( "//span[#class='styles__StyledHtmlWrapper-l0qhyk-0 cUaVYv sc-jAaTju jlDUtI']/p/descendant::text()").extract_first(),
'subtitle1': response.xpath("//h2[#class='sc-cSHVUG gCeeYI']/descendant::text()").extract_first(),
'para1': response.xpath("//span[#class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[1]/descendant::text()").extract_first(),
'para2': response.xpath("//span[#class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[2]/descendant::text()").extract_first(),
'para3': response.xpath("//span[#class='styles__StyledHtmlWrapper-s1eywhsl-0 cJBjEA sc-jAaTju jlDUtI']/p[3]/descendant::text()").extract_first(),
'flyer': response.xpath("//img[contains(#class,'styles__Artwork')]/#src").extract_first(),
'subtitle2': response.xpath("//span[#class='styles__StyledHtmlWrapper-l0qhyk-0 cUaVYv sc-jAaTju jlDUtI']/p/descendant::text()").extract_first(),
'banner_image_1': response.xpath("//div[#class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][1]/div[#class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[#class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/#src").extract_first(),
'banner_image_2': response.xpath("//div[#class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][2]/div[#class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[#class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/#src").extract_first(),
'banner_image_3': response.xpath("//div[#class='styles__PhotoWrapper-s1brd5dy-2 cpnBtx'][3]/div[#class='styles__PhotoInnerWrapper-s1brd5dy-3 gVsbNY']/img[#class='styles__PhotoImage-s1brd5dy-4 cqQHmb']/#src").extract_first(),
}
You need to change this part:
for url in urls:
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
to:
for url in urls:
if "/shop/#ticket" in url:
next
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
UPDATE
If you want to replace "/shop/#ticket" at the end of an URL:
for url in urls:
url = re.sub( r'/shop/#ticket$', "", url )
url = response.urljoin(url)
yield scrapy.Request(url=url, callback=self.parse_details)
Related
In spider, I just want to request URLs that have one rule.
URLs list :
www.example.com/bread/coffee/A
www.example.com/bread/coffee/B
www.example.com/bread/coffee/C
www.example.com/bread/coffee/D
so start_request is www.example.com/bread/coffee/A
and then what gonna have to do in def parse ??
class MySpider(scrapy.Spider):
name = 'exmple.com'
start_urls = [www.example.com/bread/coffee/A]
def parse(self, response):
???
yield ???
a little hint will appreciate
you can use code like this:
class MySpider(scrapy.Spider):
name = 'exmple.com'
start_urls = ['www.example.com/bread/coffee/A']
def start_requests(self):
urls = [
'www.example.com/bread/coffee/A',
'www.example.com/bread/coffee/B',
'www.example.com/bread/coffee/C',
'www.example.com/bread/coffee/D'
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
# find what you need
yield # your item/dict
also, make sure your url(s) has correct format, they should contains http or https
aslo you can use string module to generate your urls
import string
def start_requests(self):
url = 'www.example.com/bread/coffee/{}'
for l in string.ascii_uppercase:
url = url.format(l)
yield scrapy.Request(url=url, callback=self.parse)
I am confused how to scrape the URL itself in following links scrapy.
I do crawling on this page here
import scrapy
from ..items import SkripsiItem
class SkripsiSpiderSpider(scrapy.Spider):
name = 'skripsi'
start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']
def parse(self, response):
for href in response.css('.lnk-t a::attr(href)'):
yield response.follow(href, self.parse_author)
for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
yield response.follow(href, self.parse)
def parse_author(self, response):
items = SkripsiItem()
def extract_with_css(query):
return response.css(query).get(default='').strip()
content = response.xpath(".//div[#class='vidy-embed']/descendant::text()").extract()
items['title'] = extract_with_css('h1::text'),
items['author'] = extract_with_css('.author a::text'),
items['time'] = extract_with_css('time::text'),
items['imagelink'] = extract_with_css('.article img::attr(src)'),
items['content'] = ''.join(content),
yield items
how to scrape every url that is visited at the following link, which is in the code above is .lnk -t a :: attr (href)
Save items['url'] = response.url in the parse_author function.
I am trying to make a web scraper but I'm unable to get the link of the next page. I have tried some combinations but none of them work. The tutorial on scrapy.org has a simpler format so it doesn't solve my problem
The site I'm scraping has the following layout:
<nav class="nav_class">
<a class="class_1" href="1.html">
<a class="class_2" href="2.html">
<a class="class_3" href="3.html">
I want to get the 3.html link using css selectors
import scrapy
class MySpider(scrapy.Spider):
name = "flip_spider"
def start_requests(self):
urls = [
"https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=1",
]
for url in urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
# page_id=response.url.split("=")[-1]
phone_details = response.css("div._1-2Iqu.row")
for ph in phone_details:
phone = ph.css("div._3wU53n::text").get()
rating = ph.css("div.hGSR34::text").get()
price = ph.css("div._1vC4OE._2rQ-NK::text").get()
yield{
"name": phone,
"rating": rating,
"price": price,
}
final = "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=6"
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
# ^This is the line I need help with
if next_page_id is not final:
next_page = response.urljoin(next_page_id)
yield scrapy.Request(next_page, callback=self.parse)
It only scrapes the first page and then stops
Change you code to this and it will work
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
next_page = response.urljoin(next_page_id)
yield scrapy.Request(next_page, callback=self.parse)
I want to scrape every next page. I've found a way to do it with scrapy shell but I don't know if my spider will iterate through every page or just the next one; I'm not too sure how to implement that.
alphabet = string.ascii_uppercase
each_link = '.' + alphabet
each_url = ["https://myanimelist.net/anime.php?letter={0}".format(i) for i in each_link]
#sub_page_of_url = [[str(url)+"&show{0}".format(i) for i in range(50, 2000, 50)] for url in each_url] #start/stop/steps
#full_url = each_url + sub_page_of_url
class AnimeScraper_Spider(scrapy.Spider):
name = "Anime"
def start_requests(self):
for url in each_url:
yield scrapy.Request(url=url, callback= self.parse)
def parse(self, response):
next_page_url = response.xpath(
"//div[#class='bgColor1']//a[text()='Next']/#href").extract_first()
for href in response.css('#content > div.normal_header.clearfix.pt16 > div > div > span > a:nth-child(1)') :
url = response.urljoin(href.extract())
yield Request(url, callback = self.parse_anime)
yield Request(next_page_url, callback=self.parse)
def parse_anime(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
return {
"title" : tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis" : tr_sel.css("div.pt4::text").extract_first(),
"type_" : tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes" : tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating" : tr_sel.css('td:nth-child(5)::text').extract_first().strip()
}
I think that you're trying something too complicated, it should be as simple as:
Start from the main page
Identify all the pages that start with a particular letter
For each of these pages, take all the next links and repeat
It looks something like that:
import string
import scrapy
from scrapy import Request
class AnimeSpider(scrapy.Spider):
name = "Anime"
start_urls = ['https://myanimelist.net/anime.php']
def parse(self, response):
xp = "//div[#id='horiznav_nav']//li/a/#href"
return (Request(url, callback=self.parse_anime_list_page) for url in response.xpath(xp).extract())
def parse_anime_list_page(self, response):
for tr_sel in response.css('div.js-categories-seasonal tr ~ tr'):
yield {
"title": tr_sel.css('a[id] strong::text').extract_first().strip(),
"synopsis": tr_sel.css("div.pt4::text").extract_first(),
"type_": tr_sel.css('td:nth-child(3)::text').extract_first().strip(),
"episodes": tr_sel.css('td:nth-child(4)::text').extract_first().strip(),
"rating": tr_sel.css('td:nth-child(5)::text').extract_first().strip(),
}
next_urls = response.xpath("//div[#class='spaceit']//a/#href").extract()
for next_url in next_urls:
yield Request(response.urljoin(next_url), callback=self.parse_anime_list_page)
In method parse() spider crawls 4 urls and then sends to method parse_dir_contents() to scrape some data but only 4th url is being scraped I don't understand why it is not scraping other 3 urls?
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
name = "linkedin"
allowed_domains = ["linkedin.com"]
start_urls = [
"https://in.linkedin.com/directory/people-s-1-2-4/",
]
def parse(self, response):
for href in response.xpath('//*[#id="seo-dir"]/div/div/div/ul/li/a/#href'):
url = response.urljoin(href.extract())
print "________________"+url
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath('//*[#id="profile"]'):
url = response.url
print "____________"+url
item = VOneItem()
item['name'] = sel.xpath('//*[#id="name"]/text()').extract()
item['headline'] = sel.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = sel.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = url
yield item
By inspecting the pages I think that there is no need of the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
item = VOneItem()
item['name'] = response.xpath('//*[#id="name"]/text()').extract()
item['headline'] = response.xpath('//*[#id="topcard"]/div/div/div/p/span/text()').extract()
item['current'] = response.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
item['education'] = response.xpath('//*[#id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
item['link'] = response.url
return item
And check if this solves your issue.