Web Crawler not following the links - python

I want to crawl a news site using Scrapy. The code retrieves related news from the current page, but it does not follow the next-page links. The site's pagination link carries the btn-next btn class.
The code I am using:
import scrapy


class fakenews(scrapy.Spider):
    name = "bb8"
    allowed_domains = ["snopes.com"]
    start_urls = [
        "https://www.snopes.com/fact-check/category/science/"
    ]
    custom_settings = {'FEED_URI': "fakenews_%(time)s.csv",
                       'FEED_FORMAT': 'csv'}

    def parse(self, response):
        name1 = input(" Please enter input : ")
        name1 = name1.lower()
        links = response.xpath("//div[@class='media-list']/article/a/@href").extract()
        headers = response.xpath('//div[@class="media-body"]/h5/text()').extract()
        headers1 = [c.strip().lower() for c in headers]
        raw_data = zip(headers1, links)
        for header, link in raw_data:
            p = header
            l = link
            if name1 in p:
                scrap_info3 = {'page': response.url, 'title': header, 'link': l}
                yield scrap_info3

            next_page = response.css("//a[@class='btn-next btn']/@href").get()
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)
It returns information from the current page, but it also shows an error.
For the input I entered: NASA

The main error is that you pass an XPath expression to the css() method for next_page:
next_page = response.css("//a[@class='btn-next btn']/@href").get()
The next problem is that you yield the next-page request inside the for loop. This leads to a lot of duplicate requests.
So I suggest these changes:
def parse(self, response):
    name1 = input(" Please enter input : ")
    name1 = name1.lower()
    links = response.xpath("//div[@class='media-list']/article/a/@href").extract()
    headers = response.xpath('//div[@class="media-body"]/h5/text()').extract()
    headers1 = [c.strip().lower() for c in headers]
    # my changes start here:
    raw_data = zip(headers1, links)
    # use fewer variables in the loop (cosmetic, but it makes the code more readable)
    for header, link in raw_data:
        if name1 in header:
            yield {'page': response.url, 'title': header, 'link': link}
    # use a proper CSS selector here
    next_page = response.css("a.btn-next::attr(href)").get()
    # move this whole block out of the for loop
    if next_page:
        yield response.follow(next_page)
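For reference, the same pagination step can be written with either selector style; this is a minimal sketch, not part of the original answer (the btn-next class comes from the selectors above, the rest is standard Scrapy):

def parse(self, response):
    # ... item extraction as above ...

    # 1. CSS selector + response.follow, which accepts relative URLs and
    #    defaults to the spider's parse() when no callback is given:
    next_page = response.css("a.btn-next::attr(href)").get()
    if next_page:
        yield response.follow(next_page, callback=self.parse)

    # 2. Equivalent XPath selector + explicit urljoin + Request:
    # next_page = response.xpath("//a[contains(@class, 'btn-next')]/@href").get()
    # if next_page:
    #     yield scrapy.Request(response.urljoin(next_page), callback=self.parse)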

Related

Extracting next page and setting a break

I'm trying to extract webpage data and also want to take the next few pages, but only up to a limit that I can alter. However, I've tested whether I can at least extract the next few pages using Scrapy (I'm trying to figure this out in Scrapy to learn it), but it only returns the items from the first page.
How do I extract the next pages while setting a limit, i.e. 5 pages?
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess


class StatisticsItem(scrapy.Item):
    ebay_div = Field(output_processor=TakeFirst())
    url = Field(output_processor=TakeFirst())


class StatisticsSpider(scrapy.Spider):
    name = 'ebay'
    start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
                  '&LH_PrefLoc=2&mag=1&_sop=16']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url
            )

    def parse(self, response):
        all_cards = response.xpath('//div[@class="s-item__wrapper clearfix"]')
        for card in all_cards:
            name = card.xpath('.//h3/text()').get()  # get name of product
            price = card.xpath('.//span[@class="s-item__price"]//text()').get()  # price
            product_url = card.xpath('.//a[@class="s-item__link"]//@href').get()  # link to product
            # now do whatever you want, append to dictionary, yield as item...
            summary_data = {
                "Name": name,
                "Price": price,
                "URL": product_url
            }
            data = {'summary_data': summary_data}
            yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)

            # get the next page
            next_page_url = card.xpath('.//a[@class="pagination__next icon-link"]/@href').extract_first()
            # The last page does not have a valid url and ends with '#'
            if next_page_url == None or str(next_page_url).endswith("#"):
                self.log("eBay products collected successfully !!!")
            else:
                print('\n' + '-' * 30)
                print('Next page: {}'.format(next_page_url))
                yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_product_details(self, response):
        # Get the summary data
        data = response.meta['summary_data']
        data['location'] = response.xpath('//span[@itemprop="availableAtOrFrom"]/text()').extract_first()
        yield data


process = CrawlerProcess(
    settings={
        'FEED_URI': 'collectible_cards.json',
        'FEED_FORMAT': 'jsonlines'
    }
)

process.crawl(StatisticsSpider)
process.start()
You can try it like this: build the paginated URLs first, and start_requests will then request them all:
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1,5)]

Problems getting next page when scraping with scrapy

I have a Scrapy spider which doesn't crawl the pagination links, and I'm stuck.
The page I am scraping is:
https://www.levenhuk.bg/katalog/teleskopi/?page=1
My code is:
import scrapy


class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }

        next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))
I feel like the problem is simply that you are not specifying a callback in your pagination request. Specify your parse function as the callback and that should work. Please comment if it still doesn't work.
Edit:
In this case I feel like your logic needs an overhaul. I suggest separating the pagination and item extraction logic. Try the following:
def parse(self, response):
    # scrape the current page, then queue the pagination links
    yield from self.extract_item(response)
    next_page_urls = response.xpath('//*[@class="pagesCount"][1]//@href').getall()
    if next_page_urls:
        for url in next_page_urls:
            yield scrapy.Request(response.urljoin(url), callback=self.extract_item)

def extract_item(self, response):
    for product in response.xpath('//div[@class="catalog-item"]'):
        yield {
            # 'name': product.xpath('.//span[@itemprop="name" and contains(text(), "Levenhuk")]/text()').get(),
            'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
            # 'price': product.xpath('.//div[@class="price"]/span/text()').get(),
            'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
            'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
        }
So now the parse function handles pagination and the extract_item function extracts items for every page.
Also modify allowed_domains as specified by Pasindu.
Change this:
allowed_domains = ['https://www.levenhuk.bg/']
to:
allowed_domains = ['levenhuk.bg']
You also need to change:
next_page_url = response.xpath('//*[@class="pagesCount"][1]//@href').get()
This will only work for the first page; for pages 2, 3, 4, ..., it will extract a link back to the first page.
Also add a callback, as mentioned by UzairAhmed.
This is a little tricky, since the standard practice is usually just to check whether there is a next-page button, in a loop, until there isn't.
Here's an example: since there is no next-page button here, we can figure out the total page count instead. There will be a duplicate request to page 1 with this method, though, so it's not the most ideal situation.
import scrapy


class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page=1']
    download_delay = 3

    def parse(self, response):
        total_pages = response.css('.pagesCount a::text')[-1].get()
        total_pages = int(total_pages)
        for i in range(1, total_pages + 1):
            url = 'https://www.levenhuk.bg/katalog/teleskopi/?page={}'.format(i)
            yield scrapy.Request(url, callback=self.parse_item, dont_filter=True)

    def parse_item(self, response):
        for product in response.xpath('//div[@class="catalog-item"]'):
            yield {
                'name': product.xpath('.//span[@itemprop="name"]/text()').get(),
                'price': product.xpath('.//span[@itemprop="price"]/text()').re_first(r'[0-9]+,[0-9]+'),
                'short_discr': product.xpath('.//div[@class="opis-item"]/p/strong/text()').get()
            }
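A follow-up on the duplicate-request caveat above (a sketch, not part of the original answer): since page 1 has already been downloaded when parse runs, it can be scraped directly and only pages 2..N requested, which also removes the need for dont_filter=True:

def parse(self, response):
    # Page 1 is the response we already have, so scrape it in place ...
    yield from self.parse_item(response)
    total_pages = int(response.css('.pagesCount a::text')[-1].get())
    # ... and request only the remaining pages, so nothing is duplicated.
    for i in range(2, total_pages + 1):
        yield scrapy.Request(
            'https://www.levenhuk.bg/katalog/teleskopi/?page={}'.format(i),
            callback=self.parse_item)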
Another method would be to just look at how many pages there are and override your start_requests method as follows:
class TelescopesSpider(scrapy.Spider):
    name = 'telescopes'
    allowed_domains = ['https://www.levenhuk.bg/']
    start_urls = ['https://www.levenhuk.bg/katalog/teleskopi/?page={}']
    download_delay = 3

    def start_requests(self):
        for i in range(1, 14):
            yield scrapy.Request(self.start_urls[0].format(str(i)), callback=self.parse)

Web Scraping all Urls from a website with Scrapy and Python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[@class="work_area_content"]/a/@href'))
from a website and return the Title and URL of all the leaves, sectioned by each leaf's parent. I have two scrapers: one in plain Python and one in Scrapy. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or a single-dimension list (I believe multidimensional, but that adds complication)? Which of the code below is better? If the Scrapy code is better, how do I migrate the plain Python code to the Scrapy code?
From what I understand, a callback passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and is therefore lost, and you're unable to get back to the root data. Is this correct?
Python:
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[@class="work_area_content"]/a/@href').extract()

i = 0
j = 0
k = 0

for i, url in enumerate(urls):
    absolute_url = "".join(['http://1.1.1.1:1234/', url])
    url_storage[i][j][k].append(absolute_url)
    print(url_storage)
    #url_storage.insert(i, absolute_url)
    page = requests.get(url_storage[i][j][k])
    tree2 = html.fromstring(page.content)
    urls2 = tree2.xpath('//div[@class="work_area_content"]/a/@href').extract()

    for j, url2 in enumerate(urls2):
        absolute_url = "".join(['http://1.1.1.1:1234/', url2])
        url_storage[i][j][k].append(absolute_url)
        page = requests.get(url_storage[i][j][k])
        tree3 = html.fromstring(page.content)
        urls3 = tree3.xpath('//div[@class="work_area_content"]/a/@href').extract()

        for k, url3 in enumerate(urls3):
            absolute_url = "".join(['http://1.1.1.1:1234/', url3])
            url_storage[i][j][k].append(absolute_url)
            page = requests.get(url_storage[i][j][k])
            tree4 = html.fromstring(page.content)
            urls3 = tree4.xpath('//div[@class="work_area_content"]/a/@href').extract()
            title = tree4.xpath('//span[@class="page_title"]/text()').extract()
            yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3})
            #yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3})

def end_page_parse_TS(self, response):
    print(response.body)
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    yield {'URL': url, 'Title': title}

def end_page_parse_TC(self, response):
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    description = response.meta.get('Description')
    description = response.xpath('//table[@class="wiki_table]/tbody[contains(/td/text(), "description")/parent').extract()
    yield {'URL': url, 'Title': title, 'Description': description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem


class DatabloggerSpider(CrawlSpider):
    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ['http://1.1.1.1:1234/']

    # The URLs to start with
    start_urls = ['http://1.1.1.1:1234/TestSuites']

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        # Now go through all the found links
        item = DatabloggerScraperItem()
        item['url_from'] = response.url
        for link in links:
            item['url_to'] = link.url
            items.append(item)
        # Return all the found items
        return items
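On the callback question above: each scrapy.Request carries its own callback and its own meta/cb_kwargs, so a callback that schedules further requests to itself does not overwrite the data attached to earlier requests. A minimal sketch of passing data along a chain of requests, not an answer from the original thread; the spider name and field names are hypothetical, while the URL and XPath expressions are taken from the question:

import scrapy


class ChainSpider(scrapy.Spider):
    name = "chain_example"   # hypothetical name
    start_urls = ["http://1.1.1.1:1234/TestSuites"]

    def parse(self, response):
        # Each link gets its own Request; the dict passed via cb_kwargs travels
        # with that specific request and is never shared or overwritten.
        for href in response.xpath('//div[@class="work_area_content"]/a/@href').getall():
            yield response.follow(href, callback=self.parse_leaf,
                                  cb_kwargs={"parent_url": response.url})

    def parse_leaf(self, response, parent_url):
        yield {
            "Parent": parent_url,
            "URL": response.url,
            "Title": response.xpath('//span[@class="page_title"]/text()').get(),
        }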

Python Scrapy scrape data from nested pages

I have made a scraper that scrapes data from a website where the data is nested. To get to the data page I have to click through 5 links before I reach the page where I scrape the data.
For every page 1 there are multiple page 2s, for every page 2 there are many page 3s, and so on.
So I have a parse function for opening each page until I get to the page that has the data, where I add the data to the item class and return the item.
But it is skipping a lot of links without scraping data. It is not executing the last parse_link function after 100 or so links. How do I know the parse_link function is not executing? Because I am printing print '\n\n', 'I AM EXECUTED !!!!' and it stops printing after 100 or so links, while parse_then keeps executing every time.
What I want to know is: am I doing it right? Is this the right approach to scrape a website like this?
Here is the code:
# -*- coding: utf-8 -*-
import scrapy
from urlparse import urljoin
from nothing.items import NothingItem


class Canana411Spider(scrapy.Spider):
    name = "canana411"
    allowed_domains = ["www.canada411.ca"]
    start_urls = ['http://www.canada411.ca/']

    # PAGE 1
    def parse(self, response):
        SET_SELECTOR = '.c411AlphaLinks.c411NoPrint ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_next)

    # PAGE 2
    def parse_next(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_more)

    # PAGE 3
    def parse_more(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_other)

    # PAGE 4
    def parse_other(self, response):
        SET_SELECTOR = '.clearfix.c411Column.c411Column3 ul li'
        for attr in response.css(SET_SELECTOR):
            linkse = 'a ::attr(href)'
            link = attr.css(linkse).extract_first()
            link = urljoin(response.url, link)
            yield scrapy.Request(link, callback=self.parse_then)

    # PAGE 5
    def parse_then(self, response):
        SET_SELECTOR = '.c411Cities li h3 a ::attr(href)'
        link = response.css(SET_SELECTOR).extract_first()
        link = urljoin(response.url, link)
        return scrapy.Request(link, callback=self.parse_link)

    # PAGE 6, THE DATA PAGE
    def parse_link(self, response):
        print '\n\n', 'I AM EXECUTED !!!!'
        item = NothingItem()
        namese = '.vcard__name ::text'
        addressse = '.c411Address.vcard__address ::text'
        phse = 'span.vcard__label ::text'
        item['name'] = response.css(namese).extract_first()
        item['address'] = response.css(addressse).extract_first()
        item['phone'] = response.css(phse).extract_first()
        return item
Am I doing it right, or is there a better way that I am missing?
If there's no conflict (e.g. the 1st page cannot contain selectors and links that should only be taken from the 3rd, or from any page other than the 2nd, or something alike), I'd recommend flattening the rules used to extract links. One parse method would then be enough.
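A minimal sketch of that flattened idea, not from the original answer: it assumes the intermediate listing pages can all be handled by one set of link selectors and that the data page can be recognised by its markup. The selectors are the ones from the question; treating them as interchangeable across pages is an assumption.

# -*- coding: utf-8 -*-
import scrapy
from urlparse import urljoin
from nothing.items import NothingItem


class FlatCanada411Spider(scrapy.Spider):
    name = "canana411_flat"   # hypothetical name
    allowed_domains = ["www.canada411.ca"]
    start_urls = ['http://www.canada411.ca/']

    # All the intermediate-page selectors from the question, merged into one list.
    LINK_SELECTORS = [
        '.c411AlphaLinks.c411NoPrint ul li a ::attr(href)',
        '.clearfix.c411Column.c411Column3 ul li a ::attr(href)',
        '.c411Cities li h3 a ::attr(href)',
    ]

    def parse(self, response):
        # If the page looks like a data page, scrape it and stop ...
        if response.css('.vcard__name'):
            item = NothingItem()
            item['name'] = response.css('.vcard__name ::text').extract_first()
            item['address'] = response.css('.c411Address.vcard__address ::text').extract_first()
            item['phone'] = response.css('span.vcard__label ::text').extract_first()
            yield item
            return
        # ... otherwise follow every candidate link with the same callback.
        for selector in self.LINK_SELECTORS:
            for link in response.css(selector).extract():
                yield scrapy.Request(urljoin(response.url, link), callback=self.parse)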

How can I make Scrapy process the URLs sequentially

I have this code:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//div[@class="headline_area"]')
    items = []

    for site in sites[:5]:
        item = StackItem()
        log.msg(' LOOP' + str(ivar) + '', level=log.ERROR)
        item['title'] = "yoo ma"
        request = Request("blabla", callback=self.test1)
        request.meta['item'] = item
        page_number = nextlink.split("&")[-1].split("=")[-1]
        if int(page_number) > 500:
            raise CloseSpider('Search Exceeded 500')
        ivar = ivar + 1
        yield request

    mylinks = soup.find_all('a')
    if mylinks:
        nextlink = mylinks[0].get('href')
        page_number = nextlink.split("&")[-3].split("=")[-1]
        request = Request(urljoin(response.url, nextlink), callback=self.parse)
        request.meta['page'] = page_number
        yield request
Now my problem is that suppose I want to stop at page_number = 5.
Scrapy goes to that page before all the items from page 1, page 2, etc. have been downloaded, and stops as soon as it first reaches it.
How can I get rid of this problem so that it processes all the links before going to page 5?
Does the link have some regularity across the different pages? For example, if the 5th page's link is www.xxxx.net/nForum/#!article/Bet/447540?p=5, you can request the link with p=5 directly.
You can use the inline_requests decorator.
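A minimal sketch of what that looks like, assuming the third-party scrapy-inline-requests package (which provides the inline_requests decorator) is installed; the spider name and URLs below are placeholders:

import scrapy
from inline_requests import inline_requests


class SequentialSpider(scrapy.Spider):
    name = 'sequential_example'                      # hypothetical name
    start_urls = ['http://www.example.com/?page=1']  # placeholder URL

    @inline_requests
    def parse(self, response):
        # Inside an @inline_requests callback, yielding a Request returns its
        # response right here, so the pages are processed strictly in order.
        for page in range(2, 6):
            next_response = yield scrapy.Request(
                'http://www.example.com/?page={}'.format(page))
            yield {'page': page, 'title': next_response.css('title::text').get()}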
