Not being able to follow links in Scrapy - python

When I run my scrapy web crawler it is not following the pages to scrape the data in my code.
import scrapy

from ..items import YellowpagesItem


class YSpider(scrapy.Spider):
    """Spider over Yellow Pages pizza search results for Conshohocken, PA."""

    name = 'yp2'
    allowed_domains = ['yellowpages.com']
    start_urls = [
        'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Conshohocken%2C+PA'
    ]

    def parse(self, response):
        # Follow each matched business anchor to its detail page.
        for anchor in response.css('a.businesss-name'):
            target = anchor.attrib.get('href')
            yield response.follow(target, callback=self.parse_business)

    def parse_business(self, response):
        # Populate one item from a single business detail page.
        result = YellowpagesItem()
        result['name'] = response.css('h1::text').get()
        result['phone'] = response.css('p.phone::text').get()
        result['street'] = response.css('h2 > span::text').get()
        result['city_state'] = response.css('div.contact > h2.address::text').get()
        result['tags'] = ','.join(tag.get() for tag in response.css('p.cats > a::text'))
        result['email'] = response.css('a.email-business').attrib.get('href')
        yield result

Simple typo in your selector: it should be a.business-name.
def parse(self, response):
    # Iterate over every matched anchor and follow its href.
    for anchor in response.css('a.businesss-name'):
        yield response.follow(anchor.attrib.get('href'), callback=self.parse_business)
If you do not know already, you can test your selectors and the follow function in the Scrapy shell; this will avoid such situations. Enter in your terminal:
scrapy shell 'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Conshohocken%2C+PA'
Then test the selectors like:
>> response.css('a.businesss-name')
<< []
>> response.css('a.business-name')
<< [<Selector xpath="descendant-or-self:...>,...]
>> response.follow(response.css('a.business-name::attr(href)').get())
<< <GET https://www.yellowpages.com/conshohocken-pa/mip/tony-joes-pizzeria-10728468?lid=1002028703627>

Related

Can't scrape next page contents using Scrapy

I want to scrape the contents from the next pages too but it didn't go to the next page. My code is:
import scrapy


class AggregatorSpider(scrapy.Spider):
    """Scrape processor names and prices from startech.com.bd.

    NOTE(review): allowed_domains holds a domain-plus-path, not a bare
    domain, so the offsite middleware filters every followed request —
    this is the bug the question is about (fix: ['www.startech.com.bd']).
    """

    name = 'aggregator'
    allowed_domains = ['startech.com.bd/component/processor']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        # '@class' / '@href' restored here: the pasted code showed '#class',
        # which is not valid XPath attribute syntax.
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            print('\n')
            print(name)
            print(price)
            print('\n')
        next_page_url = response.xpath('//*[@class="pagination"]/li/a/@href').extract_first()
        # absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(next_page_url)
I didn't use urljoin because next_page_url already gives me the whole URL. I also tried the dont_filter=True argument in the yielded Request, which gives me an infinite loop through the first page. The message I'm getting from the terminal is: [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.startech.com.bd': <GET https://www.startech.com.bd/component/processor?page=2>
This is because your allowed_domains variable is wrong, use allowed_domains = ['www.startech.com.bd'] instead (see the doc).
You can also modify your next page selector in order to avoid going to page one again:
import scrapy


class AggregatorSpider(scrapy.Spider):
    """Corrected spider: proper allowed_domains, plus a next-page selector
    that targets only the pagination's last ("next") link so the spider
    does not loop back to page one."""

    name = 'aggregator'
    allowed_domains = ['www.startech.com.bd']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        # '@class' restored ('#class' in the paste is invalid XPath).
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            yield {'name': name, 'price': price}
        next_page_url = response.css('.pagination li:last-child a::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(next_page_url)

What's wrong with this Scrapy spider? It scrapes only the last URL

In the parse() method the spider crawls 4 URLs and then sends them to parse_dir_contents() to scrape some data, but only the 4th URL is being scraped. I don't understand why it is not scraping the other 3 URLs.
import scrapy
from v_one.items import VOneItem
import json


class linkedin(scrapy.Spider):
    """Crawl a LinkedIn people-directory page and scrape each linked profile."""

    name = "linkedin"
    allowed_domains = ["linkedin.com"]
    start_urls = [
        "https://in.linkedin.com/directory/people-s-1-2-4/",
    ]

    def parse(self, response):
        # '@id' / '@href' restored ('#...' in the paste is invalid XPath).
        for href in response.xpath('//*[@id="seo-dir"]/div/div/div/ul/li/a/@href'):
            url = response.urljoin(href.extract())
            # print as a function keeps this snippet valid on Python 3 too.
            print("________________" + url)
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//*[@id="profile"]'):
            url = response.url
            print("____________" + url)
            item = VOneItem()
            item['name'] = sel.xpath('//*[@id="name"]/text()').extract()
            item['headline'] = sel.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
            item['current'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
            item['education'] = sel.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
            item['link'] = url
            yield item
By inspecting the pages I think that there is no need of the for loop in the parse_dir_contents function. Make the function like this:
def parse_dir_contents(self, response):
    # The profile fields are unique on the page, so no enclosing loop over
    # '//*[@id="profile"]' is needed ('@id' restored from garbled '#id').
    item = VOneItem()
    item['name'] = response.xpath('//*[@id="name"]/text()').extract()
    item['headline'] = response.xpath('//*[@id="topcard"]/div/div/div/p/span/text()').extract()
    item['current'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/span/text()').extract()
    item['education'] = response.xpath('//*[@id="topcard"]/div/div/div/table/tbody/tr/td/ol/li/a/text()').extract()
    item['link'] = response.url
    return item
And check if this solves your issue.

Scrapy: Do not crawl links on other domains page

Below is the spider I created to get all links on NecToday.com, for example.
import socket
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class PropertiesItem(scrapy.Item):
    # Primary fields
    title = scrapy.Field()
    url = scrapy.Field()


class NecSpider(CrawlSpider):
    name = "NecSpider"
    #allowed_domains = ["nectoday.com"]
    start_urls = ["http://nectoday.com"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a',)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        # Emit one item per crawled page: its <title> text plus its URL.
        hxs = HtmlXPathSelector(response)
        print(response.url)
        page = PropertiesItem()
        page["title"] = response.xpath("//title/text()").extract()
        page["url"] = response.url
        return page
This code starts to fetch all links present on site. Some of the pages have YouTube links as well. The problem is that once the first YouTube link is crawled, it starts to crawl other YouTube links referenced from the first YouTube link.
I want to crawl the first YouTube link, but no others. YouTube is just example. Tomorrow that can be another site as well. How can this be achieved?
Why not try something along the lines of this:
start_urls = ["http://nectoday.com"]

def parse(self, response):
    # parse whatever you need
    # '@href' restored ('//#href' in the paste is invalid XPath).
    for url in response.selector.xpath('//@href').extract():
        if 'youtube.com' in url:
            # Scrape the YouTube page itself but do not follow its links.
            yield scrapy.Request(url, callback=self.parse_no_follow)
        else:
            yield scrapy.Request(url, callback=self.parse)

def parse_no_follow(self, response):
    # parse whatever you want and not follow anymore links
    pass
This will only be scraping from your allowed domain.
# NOTE(review): this snippet appears truncated by the paste — the while loop
# below never advances `init` in the lines shown, and `supersentences` is
# never used. Code kept byte-identical; comments only.
class QuotesSpider(CrawlSpider):
name = "your app name"
# Class-level page counter, shared across all callbacks.
n=0
allowed_domains = ['domain']
start_urls=['anywebpage']
rules = (
Rule(LinkExtractor(), callback='parse_item', follow=True),
)
def parse_item(self, response):
# Count every crawled page on the class attribute.
QuotesSpider.n=QuotesSpider.n+1
# Skip near-empty responses.
if (len(response.body)>100):
# html2text converter configured to drop links/images and not wrap lines.
h = html2text.HTML2Text()
h.ignore_links = True
h.ignore_images = True
h.body_width = 0
dd = response.body.decode("utf-8")
# Walk each <p>...</p> span and convert it to plain-text sentences.
init=dd.find("<p>")
while init>0:
end = dd.find("</p>", init)
if end>0:
o=h.handle(dd[init:end+4]+"\n")
supersentences=o.split('\n')

Scrapy crawler will not crawl any webpages

I have been trying to get this crawler working but I keep getting errors.
Can anyone suggest any ways to get it to run?
The main spider code is
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector


class gameSpider(scrapy.Spider):
    """Attempt to scrape product details from game.co.uk.

    NOTE(review): `Website` below is never imported or defined in this
    snippet, and allowed_domains = ["*"] is not how Scrapy disables domain
    filtering (omit the attribute instead) — both are part of the problem
    being asked about, so they are flagged rather than changed.
    """

    name = "game_spider.py"
    allowed_domains = ["*"]
    start_urls = [
        "http://www.game.co.uk/en/grand-theft-auto-v-with-gta-online-3-500-000-1085837?categoryIdentifier=706209&catGroupId="
    ]

    def parse(self, response):
        # '@class' / '@id' / '@href' restored ('#...' in the paste is
        # invalid XPath).
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []
        for site in sites:
            item = Website()
            item['name'] = site.xpath('//*[@id="details301149"]/div/div/h2/text()').extract()
            """item['link'] = site.xpath('//a/@href').extract()
            item['description'] = site.xpath('//*[@id="overview"]/div[3]()').re('-\s[^\n]*\\r')"""
            items.append(item)
        print(items)
        return items
The item code is
import scrapy


class GameItem(scrapy.Item):
    """Item holding one scraped game entry.

    BUG FIX: the original declared `class GameItem(Item)` with `name =
    Field()`, but only `scrapy` was imported — both bare names raise
    NameError. Qualify them as scrapy.Item / scrapy.Field.
    """

    name = scrapy.Field()
Your start_urls link returns erorr 500.
There's no items.
In [7]: sites = response.xpath('//ul[#class="directory-url"]/li')
In [8]: sites
Out[8]: []

Scrapy Spider just crawls and does not scrape

I am making a project in which I have used scrapy to scrape items from web sites, but the problem is, the xpaths of the 1st 2 pages of that site is different from the xpaths of the other pages.
As a result, my spider just scrapes the items from the first two pages and simply crawls over the other pages.
How can I make my spider also scrape the items of the pages too??
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request


class ProjectSpider(BaseSpider):
    """Scrape computer-repair listings from The Sun's directory.

    NOTE(review): allowed_domains should be a bare domain
    (['directory.thesun.co.uk']), not a full URL — left as posted since it
    is part of the question being discussed.
    """

    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        # Build the next /page/N URL; give up once pagination wraps back.
        if '/page/' in fired_url:
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                # end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no

    # the parse procedure, and here is the codes which declares which field to scrape.
    def parse(self, response):
        # '@class' / '@title' / '@href' / '@id' / '@alt' / '@src' restored
        # ('#...' in the paste is invalid XPath attribute syntax).
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item
        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
for other pages I need to use this: sites = hxs.select('//div[#class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it's just scraping the first two pages and simply crawling over the others.
What did you try so far?
One solution would be using an index-like parameter passed as a meta data when calling for the next page. Something like:
def parse(self, response):
    # BUG FIX: the original named this flag '2nd_xpath', which is not a
    # valid Python identifier (identifiers cannot start with a digit).
    hxs = HtmlXPathSelector(response)
    use_2nd_xpath = False
    try:
        # Requests after the first carry an 'index' meta value; pages past
        # index 1 use the alternate layout and hence the second selector.
        if response.meta['index'] > 1:
            use_2nd_xpath = True
        index = response.meta['index']
    except KeyError:
        index = 0
    # '@class' restored ('#class' in the paste is invalid XPath).
    sites = (hxs.select('//div[@class="icListItem"]') if use_2nd_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...

    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request
That code sure as hell can be improved but you get the idea.

Categories

Resources