I have this code
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="headline_area"]')
    items = []
    for site in sites[:5]:
        item = StackItem()
        log.msg(' LOOP' + str(ivar), level=log.ERROR)
        item['title'] = "yoo ma"
        request = Request("blabla", callback=self.test1)
        request.meta['item'] = item
        page_number = nextlink.split("&")[-1].split("=")[-1]
        if int(page_number) > 500:
            raise CloseSpider('Search Exceeded 500')
        ivar = ivar + 1
        yield request

    mylinks = soup.find_all('a')
    if mylinks:
        nextlink = mylinks[0].get('href')
        page_number = nextlink.split("&")[-3].split("=")[-1]
        request = Request(urljoin(response.url, nextlink), callback=self.parse)
        request.meta['page'] = page_number
        yield request
Now my problem is this: suppose I want to stop at page_number = 5. Scrapy goes to that page before all the items from page 1, page 2, etc. have been downloaded, and it stops as soon as it first reaches it.
How can I get rid of that problem, so that it processes all the links before going to page = 5?
Does the link have some regularity across pages? For example, if the 5th page's link is www.xxxx.net/nForum/#!article/Bet/447540?p=5, you can build the link with p=5 directly and scrape it.
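For instance, here is a minimal sketch of that idea (the base URL, spider name and XPath below are placeholders, not taken from the question): generate the page URLs up front, so the spider naturally stops after page 5.

import scrapy

class DirectPagesSpider(scrapy.Spider):
    name = "direct_pages"

    def start_requests(self):
        # placeholder URL pattern; substitute the real listing URL and its p= parameter
        base = "http://www.example.com/nForum/article/Bet/447540?p=%d"
        for page in range(1, 6):  # only pages 1..5, so the crawl stops by itself
            yield scrapy.Request(base % page, callback=self.parse)

    def parse(self, response):
        # extract the items of a single result page here
        for title in response.xpath('//div[@class="headline_area"]//text()').extract():
            yield {"page": response.url, "title": title.strip()}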
You can use the inline_requests decorator.
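That decorator comes from the scrapy-inline-requests package (pip install scrapy-inline-requests). Here is a rough sketch of how it could look for this case, with placeholder URLs and selectors: the decorated callback is a generator, and every plain Request it yields returns its response in place, so all items of a page are handled before the next listing page is requested.

import scrapy
from inline_requests import inline_requests

class ListingSpider(scrapy.Spider):
    # the URLs and XPaths here are placeholders, not taken from the question
    name = "listing"
    start_urls = ["http://www.example.com/list?page=1"]

    @inline_requests
    def parse(self, response):
        page = 1
        while page <= 5:
            # handle every item of the current page before touching the next page
            for href in response.xpath('//div[@class="headline_area"]/a/@href').extract():
                detail = yield scrapy.Request(response.urljoin(href))
                yield {"page": page, "title": detail.xpath("//h1/text()").extract_first()}

            next_href = response.xpath('//a[@rel="next"]/@href').extract_first()
            if not next_href:
                break
            page += 1
            response = yield scrapy.Request(response.urljoin(next_href))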
So I'm relatively new to Scrapy and am trying to write a crawler that pulls the hyperlinks for businesses on a listing page. Here is the code:
class EmailSpider(CrawlSpider):
    name = "emailcrawler"
    start_urls = [
        'https://www.yellowpages.com/search?search_terms=Computer+Software+%26+Services&geo_location_terms=Florence%2C+KY'
        # 'https://www.yellowpages.com/search?search_terms=Computers+%26+Computer+Equipment-Service+%26+fix&geo_location_terms=FL'
    ]

    def parse(self, response):
        information = response.xpath('//*[@class="info"]')
        for info in information:
            website = info.xpath('.//*[@class="links"]/a/@href').extract_first()
            if website != "None":
                request = Request(url=website, callback=self.parse_email, errback=self.handle_error,
                                  meta={'dont_retry': True, 'dont_redirect': True, 'handle_httpstatus_list': [302]})
                request.meta['data'] = {
                    'Website': website
                }
                # yield response.follow(url=website, callback=self.parse_email)
                yield request
        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url, errback=self.handle_error, meta={'dont_retry': True, 'dont_redirect': True})

    def parse_email(self, response):
        data = response.meta.get('data')
        # try:
        #     emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.com", response.text, re.I))
        # except AttributeError:
        #     return
        # data['email'] = emails
        selector = Selector(response)
        for found_address in selector.re('[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.com'):
            # item = EmailAddressItem()
            data['email_address'] = found_address
            # item['url'] = response.url
            yield data

    def handle_error(self, failure):
        self.log("Request failed: %s" % failure.request)
Before I attempted to get Scrapy to follow each link, I had it just return the list of websites that it pulled, which worked perfectly. It was able to request the next page after iterating through the URLs on the page and then yield the results.

What I am trying to do now is get it to go to each website that it pulls, extract an email element on that website if one is found, and then return to the loop and try another website. The problem is that when the crawler gets a response error, the crawl just stops. It also seems that even when the Request is successful, the crawler cannot return to the original iteration through the yellowpages URL; it gets stuck in one of the websites it follows and then the for loop dies.

How can I get the crawler to stay its course and keep attempting to pull from the websites it scrapes, while also staying within the process of iterating through each page of the listing website? To put it simply, I need to be able to go through every single page of the initial listing, no matter what request errors come up, while the crawler pops in and out of the websites it finds and attempts to scrape data on those sites.
class EmailSpider(CrawlSpider):
    name = "followwebsite"
    start_urls = [
        # 'https://www.manta.com/mb_35_D000B000_000/offices_and_clinics_of_medical_doctors',
        # 'https://www.chess.com/home'
        # 'https://webscraper.io/test-sites/e-commerce/static'
        'https://www.yellowpages.com/search?search_terms=Computer+Software+%26+Services&geo_location_terms=Florence%2C+KY',
        'https://www.yellowpages.com/search?search_terms=Computers+%26+Computer+Equipment-Service+%26+fix&geo_location_terms=FL'
    ]

    def parse(self, response):
        website = response.xpath('//*[@class="links"]/a/@href')
        yield from response.follow_all(website, self.parse_email)
        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url, errback=self.handle_error)

    def parse_email(self, response):
        selector = Selector(response)
        for found_address in selector.re('[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.com'):
            item = EmailAddressItem()
            item['email_address'] = found_address
            # item['url'] = response.url
            yield item

    def handle_error(self, failure):
        self.log("Request failed: %s" % failure.request)
Figured it out no thanks to you bums
I want to crawl a news site using Scrapy. The code retrieves related news from the current link but does not follow the next-page links, which the site exposes through a next-page button.
The code I am using:
import scrapy

class fakenews(scrapy.Spider):
    name = "bb8"
    allowed_domains = ["snopes.com"]
    start_urls = [
        "https://www.snopes.com/fact-check/category/science/"
    ]
    custom_settings = {'FEED_URI': "fakenews_%(time)s.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        name1 = input(" Please enter input : ")
        name1 = name1.lower()
        links = response.xpath("//div[@class='media-list']/article/a/@href").extract()
        headers = response.xpath('//div[@class="media-body"]/h5/text()').extract()
        headers1 = [c.strip().lower() for c in headers]
        raw_data = zip(headers1, links)
        for header, link in raw_data:
            p = header
            l = link
            if name1 in p:
                scrap_info3 = {'page': response.url, 'title': header, 'link': l}
                yield scrap_info3
            next_page = response.css("//a[@class='btn-next btn']/@href").get()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
Though it returns information from the current page, it also shows an error.
For the input I entered: NASA
The main error is that you use the css function with an XPath selector for next_page:
next_page = response.css("//a[@class='btn-next btn']/@href").get()
The next problem is that you yield the request for the next page inside the for loop. This leads to a lot of duplicate requests.
So I suggest these changes:
def parse(self, response):
    name1 = input(" Please enter input : ")
    name1 = name1.lower()
    links = response.xpath("//div[@class='media-list']/article/a/@href").extract()
    headers = response.xpath('//div[@class="media-body"]/h5/text()').extract()
    headers1 = [c.strip().lower() for c in headers]
    # my changes start from this point:
    raw_data = zip(headers1, links)
    # use fewer variables in the loop (yes, just cosmetic, but your code will be more readable)
    for header, link in raw_data:
        if name1 in header:
            yield {'page': response.url, 'title': header, 'link': link}
    # use a proper CSS selector here
    next_page = response.css("a.btn-next::attr(href)").get()
    # move all this block out of the for loop
    if next_page:
        yield response.follow(next_page)
Let's imagine I have a webpage like this.
counter.php
<?php
if (isset($_GET['count'])) {
    $count = intval($_GET['count']);
    $previous = $count - 1;
    $next = $count + 1;
?>
    <a href="?count=<?php echo $previous; ?>">< Previous</a>
    Current: <?php echo $count; ?>
    <a href="?count=<?php echo $next; ?>">Next ></a>
<?php
}
?>
This is an "infinite" website because you can just keep clicking next to go to the next page (the counter will just increase) or previous etc.
However, if I want to crawl this page and follow the links using Scrapy like this, Scrapy will never stop crawling.
Example spider:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

urls = []

class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/counter?count=1']

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        urls.append(response.url)
What kind of mechanism can I use to determine if indeed I am stuck in an infinite website and need to break out of it?
You can always break out when the page has no ITEMS on it, or has no NEXT PAGE button; that means the pagination has ended.
import logging

from scrapy import Request
from scrapy.spiders import CrawlSpider

class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['example.com']

    def start_requests(self):
        page = 1
        yield Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)

    def parse_item(self, response):
        # METHOD 1: check if items are available on this page
        items = response.css("li.items")
        if items:
            # now go to the next page
            page = int(response.meta['page']) + 1
            yield Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)
        else:
            logging.info("%s was last page" % response.url)

        # METHOD 2: check if this page has a NEXT PAGE button; most websites have one
        nextPage = response.css("a.nextpage")
        if nextPage:
            # now go to the next page
            page = int(response.meta['page']) + 1
            yield Request("http://example.com/counter?page=%s" % (page), meta={"page": page}, callback=self.parse_item)
        else:
            logging.info("%s was last page" % response.url)
You don't have to use Rule in Scrapy. You can parse the site page by page and then iterate over all the items on each page, or you can collect all the item links on each page.
For example:
from scrapy import Request
from scrapy.spiders import CrawlSpider

urls = []

class TestSpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/counter?count=1']

    def parse(self, response):
        links = response.xpath('//a[@class="item"]/@href').extract()
        for link in links:
            yield Request(link, self.parse_item)
            # you can collect the item's url here instead, so you don't have to yield to parse_item
            # urls.append(link)
        url, pg = response.url.split("=")  # you can break the infinite loop here
        if int(pg) <= 10:  # we only loop up to page 10
            yield Request(url + "=" + str(int(pg) + 1), self.parse)

    def parse_item(self, response):
        urls.append(response.url)
My spider function starts on a page, and I need to follow a link and get some data from that page to add to my item. But I need to visit various pages from the parent page without creating more items. How would I go about doing that? From what I can read in the documentation, I can only go in a linear fashion:
parent page > next page > next page
But I need to:
parent page > next page
> next page
> next page
You should return Request instances and pass the item around in meta. You would have to do it in a linear fashion, building a chain of requests and callbacks. To achieve that, you can pass around a list of requests needed to complete the item and return the item from the last callback:
def parse_main_page(self, response):
    item = MyItem()
    item['main_url'] = response.url

    url1 = response.xpath('//a[@class="link1"]/@href').extract()[0]
    request1 = scrapy.Request(url1, callback=self.parse_page1)

    url2 = response.xpath('//a[@class="link2"]/@href').extract()[0]
    request2 = scrapy.Request(url2, callback=self.parse_page2)

    url3 = response.xpath('//a[@class="link3"]/@href').extract()[0]
    request3 = scrapy.Request(url3, callback=self.parse_page3)

    request1.meta['item'] = item
    request1.meta['requests'] = [request2, request3]
    return request1

def parse_page1(self, response):
    item = response.meta['item']
    item['data1'] = response.xpath('//div[@class="data1"]/text()').extract()[0]

    # hand the partially filled item over to the next request in the chain
    next_request = response.meta['requests'].pop(0)
    next_request.meta['item'] = item
    next_request.meta['requests'] = response.meta['requests']
    return next_request

def parse_page2(self, response):
    item = response.meta['item']
    item['data2'] = response.xpath('//div[@class="data2"]/text()').extract()[0]

    next_request = response.meta['requests'].pop(0)
    next_request.meta['item'] = item
    next_request.meta['requests'] = response.meta['requests']
    return next_request

def parse_page3(self, response):
    item = response.meta['item']
    item['data3'] = response.xpath('//div[@class="data3"]/text()').extract()[0]
    return item
Also see:
How can i use multiple requests and pass items in between them in scrapy python
Almost Asynchronous Requests for Single Item Processing in Scrapy
Using Scrapy Requests, you can perform extra operations on the followed URL inside the scrapy.Request's callback.
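A rough sketch of that pattern (the URLs, field names and selectors are made up for illustration): start the item in one callback, pass it along in the request's meta, and finish it in the callback for the followed URL.

import scrapy

class ChainSpider(scrapy.Spider):
    name = "chain"
    start_urls = ["http://www.example.com/parent"]

    def parse(self, response):
        item = {"main_url": response.url}
        detail_href = response.xpath('//a[@class="detail"]/@href').extract_first()
        yield scrapy.Request(
            response.urljoin(detail_href),
            callback=self.parse_detail,
            meta={"item": item},
        )

    def parse_detail(self, response):
        # the extra operation happens here, on the followed URL
        item = response.meta["item"]
        item["detail"] = response.xpath("//h1/text()").extract_first()
        yield item  # the item is emitted only once, from the last callback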
I am working on a project in which I use Scrapy to scrape items from web sites, but the problem is that the XPaths of the first 2 pages of the site are different from the XPaths of the other pages.
As a result, my spider only scrapes the items from the first two pages and then simply crawls over the other pages.
How can I make my spider scrape the items of those pages too?
I am also including my spider here so that you can look through it if needed.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from project2.items import Project2Item
from scrapy.http import Request

class ProjectSpider(BaseSpider):
    name = "project2spider"
    allowed_domains = ["http://directory.thesun.co.uk/"]
    current_page_no = 1
    start_urls = [
        'http://directory.thesun.co.uk/find/uk/computer-repair'
    ]

    def get_next_url(self, fired_url):
        if '/page/' in fired_url:
            url, page_no = fired_url.rsplit('/page/', 1)
        else:
            if self.current_page_no != 1:
                # end of scroll
                return
        self.current_page_no += 1
        return "http://directory.thesun.co.uk/find/uk/computer-repair/page/%s" % self.current_page_no

    # the parse procedure; here is the code that declares which fields to scrape
    def parse(self, response):
        fired_url = response.url
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="abTbl "]')
        for site in sites:
            item = Project2Item()
            item['Catogory'] = site.select('span[@class="icListBusType"]/text()').extract()
            item['Bussiness_name'] = site.select('a/@title').extract()
            item['Description'] = site.select('span[last()]/text()').extract()
            item['Number'] = site.select('span[@class="searchInfoLabel"]/span/@id').extract()
            item['Web_url'] = site.select('span[@class="searchInfoLabel"]/a/@href').extract()
            item['adress_name'] = site.select('span[@class="searchInfoLabel"]/span/text()').extract()
            item['Photo_name'] = site.select('img/@alt').extract()
            item['Photo_path'] = site.select('img/@src').extract()
            #items.append(item)
            yield item
        next_url = self.get_next_url(fired_url)
        if next_url:
            yield Request(next_url, self.parse, dont_filter=True)
For the other pages I need to use this: sites = hxs.select('//div[@class="icListItem"]')
How can I include this in my spider so that it can scrape items from the other pages too?
At present it just scrapes the first two pages and simply crawls over the other pages.
What have you tried so far?
One solution would be to use an index-like parameter passed as meta data when requesting the next page. Something like:
def parse(self, response):
    hxs = HtmlXPathSelector(response)

    use_2nd_xpath = False
    try:
        if response.meta['index'] > 1:
            use_2nd_xpath = True
        index = response.meta['index']
    except KeyError:
        index = 0

    sites = (hxs.select('//div[@class="icListItem"]') if use_2nd_xpath
             else hxs.select('//div[@class="abTbl "]'))
    ...

    request = Request(next_url, self.parse, dont_filter=True)
    request.meta['index'] = index + 1
    yield request
That code sure as hell can be improved but you get the idea.