What's wrong with this Scrapy spider? It scrapes only the last URL

What's wrong with this Scrapy spider? It scrapes only the last URL - python

In the parse() method the spider crawls 4 URLs and sends each one to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json
class linkedin(scrapy.Spider):
    """Spider from the question: follow directory links, then scrape each linked page."""

    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        """Yield one request per link found under the ``seo-dir`` element.

        Each request is handled by ``parse_dir_contents``.
        """
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            print("________________" + url)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield one ``VOneItem`` per element with ``id="profile"`` on the page."""
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print("____________" + url)
            item = VOneItem()
            # NOTE(review): these XPaths start with ``//``, so they search the
            # whole document rather than only the current ``sel``.
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item

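After inspecting the pages, I don't think the for loop in the parse_dir_contents function is needed: every XPath inside it starts with //, so it searches the whole document rather than the current sel anyway. Make the function like this: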
def parse_dir_contents(self, response):
    """Build a single ``VOneItem`` from the response and return it.

    Every field is selected from the whole document, so no per-element
    loop is used.
    """
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

As the title states, I am trying to run my Scrapy program. The issue I am running into is that it only seems to return the yield from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; in my code I have also left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    """Spider from the question: visit category pages, then scrape each product page."""

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    # NOTE(review): only the first entry is followed by a comma, so Python
    # joins the remaining adjacent string literals into one long string.
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        """Yield a request for every link inside a ``product-container`` div."""
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one ``AntairaItem`` per ``main.products`` element on the page."""
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            # NOTE(review): the trailing commas below store 1-tuples, e.g. (name,).
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason, when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal. I'm not sure how to debug this since I can't even see any error output.

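The start_urls list in the question is missing the commas after every entry except the first, so Python joins the remaining adjacent string literals into one long, invalid URL and only the first URL is usable. Add the commas, or try using the start_requests method:
For example: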
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):
    """Visit each category page, follow its product links and scrape every product."""

    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        """Yield one request per category URL, handled by ``parse``."""
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Yield a request for every link inside a ``product-container`` div."""
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Yield one ``AntairaItem`` per ``main.products`` element on the page."""
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            # default='' keeps .strip() from failing when the heading is absent.
            items['name'] = product.css('h1.product-name::text').get(default='').strip()
            items['features'] = product.css('section.features h3 + ul').getall()
            items['overview'] = product.css('.products .product-overview::text').getall()
            # Only resolve the image when one was found; joining a missing
            # value would silently produce the page URL instead.
            image_src = product.css('div.selectors img::attr(src)').get()
            items['main_image'] = response.urljoin(image_src) if image_src else None
            items['rel_links'] = product.xpath(
                "//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]"
            ).getall()
            yield items

how to scrape the URL on Scrapy Following Links

I am confused about how to scrape the URL itself when following links in Scrapy.
I am crawling this page:
import scrapy
from ..items import SkripsiItem
class SkripsiSpiderSpider(scrapy.Spider):
    """Spider from the question: follow article links and the pagination link."""

    name = 'skripsi'
    start_urls = ['https://nasional.sindonews.com/topic/9695/pemilu-2019/']

    def parse(self, response):
        """Follow ``.lnk-t`` links with ``parse_author`` and the paging link with ``parse``."""
        for href in response.css('.lnk-t a::attr(href)'):
            yield response.follow(href, self.parse_author)
        for href in response.css('.newpaging li:nth-child(4) a::attr(href)'):
            yield response.follow(href, self.parse)

    def parse_author(self, response):
        """Fill a ``SkripsiItem`` from the followed page and yield it."""
        items = SkripsiItem()

        def extract_with_css(query):
            # First match for the CSS query, stripped; '' when nothing matches.
            return response.css(query).get(default='').strip()

        content = response.xpath(".//div[@class='vidy-embed']/descendant::text()").extract()
        # NOTE(review): the trailing commas below store 1-tuples, not strings.
        items['title'] = extract_with_css('h1::text'),
        items['author'] = extract_with_css('.author a::text'),
        items['time'] = extract_with_css('time::text'),
        items['imagelink'] = extract_with_css('.article img::attr(src)'),
        items['content'] = ''.join(content),
        yield items
How do I scrape the URL of every page that is visited when following links, i.e. the links selected by .lnk-t a::attr(href) in the code above?

Save the URL with items['url'] = response.url in the parse_author function (if SkripsiItem is a scrapy.Item, also declare a url field on it).

What are the best practices for calling an external api?

So let's say I want to write a spider that uses the Facebook API to calculate the likes on every page of a website. If I import the requests library, I'm able to call the Facebook Graph API as follows.
import scrapy
import json
import requests
API_KEY = "KEY_GOES_HERE"


class WebSite(scrapy.Spider):
    """Spider from the question: crawl a site and look each page up via ``requests``."""

    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        """Fetch the Graph API entry for ``url`` with ``requests`` and parse it."""
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        data = requests.get(base)
        return self.parse_likes(data)

    def parse_likes(self, data):
        """Return ``(id, comment_count, share_count)`` from the JSON in ``data.text``."""
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        """Request every link on the page, then yield one item for the page."""
        item = {}
        item['url'] = response.url
        links = response.css('a::attr(href)').extract()
        # NOTE(review): parse_likes returns (id, comment_count, share_count),
        # so 'shares' receives the comment count and 'comments' the share count.
        item['fb_url'], item['shares'], item['comments'] = self.get_likes(response.url)
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
However, I can't seem to get this code to work if, rather than using the requests library, I use scrapy.Request. Something like this.
import scrapy
import json
import requests
API_KEY = "KEY_GOES_HERE"


class WebSite(scrapy.Spider):
    """Spider from the question: same crawl, but building a ``scrapy.Request`` for the API."""

    name = "website_page"
    allowed_domains = ["website.com"]
    start_urls = ['https://website.com/']

    def get_likes(self, url):
        """Return a ``scrapy.Request`` for the Graph API entry of ``url``."""
        base = 'https://graph.facebook.com/{}?access_token={}'.format(url, API_KEY)
        return scrapy.Request(base, callback=self.parse_likes)

    def parse_likes(self, data):
        """Return ``(id, comment_count, share_count)`` from the JSON in ``data.text``."""
        data = json.loads(data.text)
        return data['id'], data['share']['comment_count'], data['share']['share_count']

    def parse(self, response):
        """Request every link on the page, then yield one item for the page."""
        item = {}
        links = response.css('a::attr(href)').extract()
        item['url'] = response.url
        # NOTE(review): get_likes only builds a Request and never yields it,
        # so ``.body`` here is read from that Request object, not from a
        # downloaded API response.
        item['fb_data'] = self.get_likes(response.url).body
        for link in links:
            link = response.urljoin(link)
            item['link'] = link
            yield scrapy.Request(link, callback=self.parse)
        yield item
In this case, I just get a blank response for the Facebook data. I think I'm missing some understanding of how scrapy.Request works relative to the standard requests library. Any ideas?

This is a very common case: how do you yield a single item built from multiple URLs?
The most common solution is to chain requests, carrying your item along in the request.meta parameter.
For your example, an implementation with this logic could look like:
class WebSite(scrapy.Spider):
    """Chain a Graph API request per link, carrying the item in ``request.meta``."""

    # ``str.format`` of the URL template, called as base(url, access_token).
    base = 'https://graph.facebook.com/{}?access_token={}'.format
    api_key = '1234'

    def parse(self, response):
        """Yield one API request per link, with the partly filled item attached."""
        links = response.css('a::attr(href)').extract()
        for link in links:
            item = {}
            item['url'] = response.url
            item['link'] = response.urljoin(link)
            # Template order is (url, access token); query the absolute link.
            api_url = self.base(item['link'], self.api_key)
            yield scrapy.Request(api_url,
                                 callback=self.parse_likes,
                                 meta={'item': item})

    def parse_likes(self, response):
        """Add the share data from the API response to the carried item and yield it."""
        item = response.meta['item']
        data = json.loads(response.text)
        share_count = data['id'], data['share']['comment_count'], data['share']['share_count']
        item['share_count'] = share_count
        yield item

How to get data from subsequent URLs with Scrapy

I want to fetch data from this website: http://www.go-on.fi/tyopaikat with Scrapy.
I am able to fetch the data from this page, where each title in the first column is a link (href) to another page.
My question is: how can I make the spider go deeper, for example to this link: http://www.go-on.fi/tyopaikat/oulu/11414, and get information from there?
This is my code at the moment:
class JobDataSpider(CrawlSpider):
    """Spider from the question: read job rows from the listing pages."""

    name = "jobdata"
    allowed_domains = ["go-on.fi"]
    start_urls = ["http://www.go-on.fi/tyopaikat?start=0",
                  "http://www.go-on.fi/tyopaikat?start=20",
                  "http://www.go-on.fi/tyopaikat?start=40",
                  "http://www.go-on.fi/tyopaikat?start=60"
                  ]

    def parse(self, response):
        """Build a ``JobData`` item for every table row on the page.

        NOTE(review): the snippet ends after the last field assignment;
        nothing is returned or yielded in the code shown.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        jobs = hxs.select("//tr")
        for row in jobs:
            item = JobData()
            item['title'] = row.select("./td[1]/a/text()").extract()
            item['link'] = row.select("./td[1]/a/@href").extract()
            item['location'] = row.select("./td[2]/text()").extract()

You need to yield a Request(url) for each of the URLs found on the first page. Something like this should work:
class JobDataSpider(CrawlSpider):
    """Follow each job link on the listing pages and scrape the linked page."""

    name = "jobdata"
    allowed_domains = ["go-on.fi"]
    start_urls = ["http://www.go-on.fi/tyopaikat?start=0",
                  "http://www.go-on.fi/tyopaikat?start=20",
                  "http://www.go-on.fi/tyopaikat?start=40",
                  "http://www.go-on.fi/tyopaikat?start=60"
                  ]

    def parse(self, response):
        """Yield one request per first-column link, handled by ``parse_pages``."""
        hxs = HtmlXPathSelector(response)
        links = hxs.xpath("//tr/td[1]/a/@href").extract()
        for l in links:
            yield Request(l, callback=self.parse_pages)

    def parse_pages(self, response):
        """Start a ``JobData`` item for the followed page."""
        hxs = HtmlXPathSelector(response)
        item = JobData()
        item['link'] = response.url
        # etc....

Scrapy Spider just crawls and does not scrape

I am making a project in which I use Scrapy to scrape items from websites, but the problem is that the XPaths of the first 2 pages of that site are different from the XPaths of the other pages.
As a result, my spider only scrapes the items from the first two pages and simply crawls over the other pages.
How can I make my spider scrape the items of the other pages too?
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request
class ProjectSpider(BaseSpider):
    """Spider from the question: scrape listing blocks and walk the numbered result pages."""

    name = "project2spider"
    # NOTE(review): this entry is a full URL rather than a bare domain name;
    # confirm against Scrapy's allowed_domains documentation.
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        """Increment the page counter and return the next page URL.

        Returns None when ``fired_url`` has no ``/page/`` part and the
        counter is already past 1.
        """
        if '/page/' in fired_url:
            # NOTE(review): url and page_no are not used afterwards.
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                # end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no

    # The parse callback; the fields to scrape are declared here.
    def parse(self, response):
        """Yield an item per ``abTbl`` block, then a request for the next page."""
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item
        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
For the other pages I need to use this: sites = hxs.select('//div[@class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it is just scraping the first two pages and simply crawls over the other pages.

What did you try so far?
One solution would be to pass an index-like parameter as metadata when requesting the next page. Something like:
def parse(self, response):
    """Pick the XPath by page index and pass the index on to the next request.

    The index is read from ``response.meta['index']`` and defaults to 0 when
    the key is absent; the second XPath is used once the index exceeds 1.
    """
    hxs = HtmlXPathSelector(response)
    index = response.meta.get('index', 0)
    use_second_xpath = index > 1
    sites = (hxs.select('//div[@class="icListItem"]') if use_second_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...  # elided in the answer: item extraction and computing next_url
    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request
That code sure as hell can be improved but you get the idea.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

whats wrong with this scrapy spider? scrapes only last url - python

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

how to scrape the URL on Scrapy Following Links

What are the best practices for calling an external api?

How to get data from subsequent urls with scrappy

Scrapy Spider just crawls and does not scrape

Categories

Resources