How to scrape the URL itself when following links in Scrapy - python

I am confused about how to scrape the URL itself when following links in Scrapy.
I am crawling this page:
import scrapy
from ..items import SkripsiItem

class SkripsiSpiderSpider(scrapy.Spider):
    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)

        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()

        items['title'] = extract_with_css('h1::text')
        items['author'] = extract_with_css('.author a::text')
        items['time'] = extract_with_css('time::text')
        items['imagelink'] = extract_with_css('.article img::attr(src)')
        items['content'] = ''.join(content)
        yield items
How do I scrape every URL that is visited when following the links, i.e. the URLs extracted by .lnk-t a::attr(href) in the code above?

Save the URL of the page being parsed with items['url'] = response.url in the parse_author function.
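For example, a minimal sketch of parse_author with the URL saved (this assumes a url field has also been added to SkripsiItem in items.py):

    def parse_author(self, response):
        items = SkripsiItem()

        def extract_with_css(query):
            return response.css(query).get(default='').strip()

        items['title'] = extract_with_css('h1::text')
        items['author'] = extract_with_css('.author a::text')
        items['time'] = extract_with_css('time::text')
        items['imagelink'] = extract_with_css('.article img::attr(src)')
        items['content'] = ''.join(
            response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract())
        items['url'] = response.url  # the URL Scrapy followed to reach this page
        yield items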

Related

Scraping all pages on quote.toscrape with scrapy

I'm trying to scrape some information from the website http://quotes.toscrape.com/,
but I cannot find a way to scrape all the pages; so far I only get the first page.
Here's my script so far:
import scrapy
from ..items import QuotetutorialItem

class QuoteSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        items = QuotetutorialItem()
        all_div_quotes = response.css('div.quote')

        for quotes in all_div_quotes:
            title = quotes.css('span.text::text').extract()
            author = quotes.css('.author::text').extract()
            tags = quotes.css('.tag::text').extract()

            items['title'] = title
            items['author'] = author
            items['tags'] = tags
            yield items

        next_page = response.xpath('//*[@class="next"]/a/@href').extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
I have also tried this:
next_page = response.xpath('//*[@class="next"]/a/@href').get()
absolute_next_page_url = response.urljoin(next_page)
if absolute_next_page_url is not None:
    yield scrapy.Request(absolute_next_page_url)
And this:
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse)
But none of these solutions seem to work.
Any ideas? :)
Thanks!

Getting to the next page using scrapy

I am trying to make a web scraper but I'm unable to get the link to the next page. I have tried some combinations but none of them work. The tutorial on scrapy.org uses a simpler page format, so it doesn't solve my problem.
The site I'm scraping has the following layout:
<nav class="nav_class">
    <a class="class_1" href="1.html">
    <a class="class_2" href="2.html">
    <a class="class_3" href="3.html">
</nav>
I want to get the 3.html link using CSS selectors.
import scrapy

class MySpider(scrapy.Spider):
    name = "flip_spider"

    def start_requests(self):
        urls = [
            "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=1",
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # page_id = response.url.split("=")[-1]
        phone_details = response.css("div._1-2Iqu.row")
        for ph in phone_details:
            phone = ph.css("div._3wU53n::text").get()
            rating = ph.css("div.hGSR34::text").get()
            price = ph.css("div._1vC4OE._2rQ-NK::text").get()
            yield {
                "name": phone,
                "rating": rating,
                "price": price,
            }
        final = "https://www.flipkart.com/mobiles/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.serviceability%5B%5D%3Dfalse&p%5B%5D=facets.offer_type%255B%255D%3DExchange%2BOffer&otracker=clp_banner_1_10.bannerX3.BANNER_mobile-phones-store_HPUGCU9BYBF6&fm=neo%2Fmerchandising&iid=M_934db066-154e-4074-a4b1-96f56a0af28e_6.HPUGCU9BYBF6&ppt=HomePage&ppn=Home&ssid=85m4yqvgzk0000001558978084715&page=6"
        next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
        # ^This is the line I need help with
        if next_page_id is not final:
            next_page = response.urljoin(next_page_id)
            yield scrapy.Request(next_page, callback=self.parse)
It only scrapes the first page and then stops.
Change your code to this and it will work. Note that is not compares object identity rather than string equality, so the original check does not behave as intended; it is enough to check that a next-page link was actually found:
next_page_id = response.css("nav._1ypTlJ a._3fVaIS::attr(href)").get()
if next_page_id:
    next_page = response.urljoin(next_page_id)
    yield scrapy.Request(next_page, callback=self.parse)

Python Scrapy scrape data from nested pages

I have made a scraper that scrapes data from a website where the data is nested; to get to the data page I have to click through 5 links before I reach the page where I scrape the data.
For every page 1 there are multiple page 2's, for every page 2 there are many page 3's, and so on.
So I have a parse function for opening each page until I get to the page that has the data, where I add the data to the item class and return the item.
But it is skipping a lot of links without scraping data. It is not executing the last parse_link function after 100 or so links. How do I know the parse_link function is not executing? Because I am printing print '\n\n', 'I AM EXECUTED !!!!' and it stops printing after 100 or so links, while parse_then executes every time.
What I want to know is: am I doing it right? Is this the right approach to scrape a website like this?
Here is the code:
# -*- coding: utf-8 -*-
import scrapy
from urlparse import urljoin
from nothing.items import NothingItem

class Canana411Spider(scrapy.Spider):
    name = "canana411"
    allowed_domains = ["www.canada411.ca"]
    start_urls = ['http://www.canada411.ca/']

    # PAGE 1
    def parse(self, response):
        SET_SELECTOR = '.c411AlphaLinks.c411NoPrint ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_next)

    # PAGE 2
    def parse_next(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_more)

    # PAGE 3
    def parse_more(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_other)

    # PAGE 4
    def parse_other(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_then)

    # PAGE 5
    def parse_then(self, response):
        SET_SELECTOR = '.c411Cities li h3 a ::attr(href)'
        link = response.css(SET_SELECTOR).extract_first()
        link = urljoin(response.url, link)
        return scrapy.Request(link, callback=self.parse_link)

    # PAGE 6, THE DATA PAGE
    def parse_link(self, response):
        print '\n\n', 'I AM EXECUTED !!!!'
        item = NothingItem()
        namese = '.vcard__name ::text'
        addressse = '.c411Address.vcard__address ::text'
        phse = 'span.vcard__label ::text'
        item['name'] = response.css(namese).extract_first()
        item['address'] = response.css(addressse).extract_first()
        item['phone'] = response.css(phse).extract_first()
        return item
Am I doing it right, or is there a better way that I am missing?
If there's no conflict (e.g. the 1st page cannot also contain the selectors and links of the 3rd page, or anything similar that would require treating some pages differently), I'd recommend flattening the rules used to extract links. A single parse callback would then be enough, as in the sketch below.
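For example, a minimal sketch of one flattened callback that reuses the selectors from the question; whether every intermediate page can really be handled by the same list selectors is an assumption that needs checking against the site:

    def parse(self, response):
        # If this page already looks like a data page, scrape it and stop here.
        if response.css('.vcard__name'):
            item = NothingItem()
            item['name'] = response.css('.vcard__name ::text').extract_first()
            item['address'] = response.css('.c411Address.vcard__address ::text').extract_first()
            item['phone'] = response.css('span.vcard__label ::text').extract_first()
            yield item
            return

        # Otherwise follow every list link and come back to this same callback.
        link_selectors = (
            '.c411AlphaLinks.c411NoPrint ul li a ::attr(href)',
            '.clearfix.c411Column.c411Column3 ul li a ::attr(href)',
            '.c411Cities li h3 a ::attr(href)',
        )
        for selector in link_selectors:
            for link in response.css(selector).extract():
                yield scrapy.Request(response.urljoin(link), callback=self.parse)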

What are the best practices for calling an external API?

So let's say I want to write a spider that uses the Facebook API to calculate the likes on every page of a website. If I import the requests library, I'm able to call the Facebook Graph API as follows.
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        item['fb_url'], item['shares'], item['comments'] = self.get_likes(response.url)
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
However, I can't seem to get this code to work if, rather than using requests, I use a scrapy.Request call. Something like this:
import scrapy
import json
import requests

API_KEY = "KEY_GOES_HERE"

class WebSite(scrapy.Spider):
    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(base, callback=self.parse_likes)

    def parse_likes(self, data):
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        item = {}
        links = response.css('a::attr(href)').extract()
        item['url'] = response.url
        item['fb_data'] = self.get_likes(response.url).body
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
In this case, I just get a blank response for the Facebook data. I think I'm missing some understanding of how the scrapy.Request method works relative to the standard requests library. Any ideas?
This is a very common case: how do you yield an item built from multiple URLs?
The most common solution is to chain the requests, carrying your item along in the request.meta parameter.
For your example, an implementation with this logic could look like:
import json

import scrapy

class WebSite(scrapy.Spider):
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        links = response.css('a::attr(href)').extract()
        for link in links:
            item = {}
            item['url'] = response.url
            item['link'] = response.urljoin(link)
            # Chain a request to the Graph API and carry the item along in meta.
            api_url = self.base(item['link'], self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        item = response.meta['item']
        data = json.loads(response.text)
        share_count = data['id'], data['share']['comment_count'], data['share']['share_count']
        item['share_count'] = share_count
        yield item

What's wrong with this scrapy spider? It scrapes only the last URL

In the parse() method the spider crawls 4 URLs and then sends them to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json

class linkedin(scrapy.Spider):
    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print "________________" + url
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print "____________" + url
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages, I think there is no need for the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.
