Scrapy only going through first 5 links with next_page_url

Scrapy only going through first 5 links with next_page_url - python

My code seemingly only goes through the first 5 links that are requested and then stops when the 6th is requested. I have tried to use start_urls and next_page_url. Both only extract from the first 5 pages given.
import scrapy
from scrapy.crawler import CrawlerProcess
import time
class finvizSpider(scrapy.Spider):
global tickers
global urlcheck
urlcheck = 1
tickers = []
name = "finviz"
start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]
def parse(self, response):
tickers.append(response.xpath('//a[#class="screener-link-primary"]/text()').extract())
print(tickers)
next_page_url = "https://finviz.com/"
html = response.xpath(
'//a[#class="screener_arrow"]/#href').extract()[0]
print(html)
next_page_url += html
print(next_page_url)
if next_page_url is not None:
yield scrapy.Request(next_page_url, callback=self.parse)
def returnTickers(self):
newTickerList= []
for lists in tickers:
if lists:
for t in lists:
newTickerList.append(t)
return newTickerList
Here is the error statement:
Any help is appreciated.
EDIT:
I have updated the code, but still seem to get errors.
import scrapy
from scrapy.crawler import CrawlerProcess
import time
from bs4 import BeautifulSoup
class finvizSpider(scrapy.Spider):
global tickers
global urlcheck
urlcheck = 1
tickers = []
name = "finviz"
start_urls = [
"https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=-change"]
def parse(self, url):
raw_html = scrapy.Request(url)
good_html = BeautifulSoup(raw_html, 'html.parser')
first_part = "https://finviz.com/"
tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
# Check if there is next page
if second_part:
next_url = first_part + second_part
self.parse(next_url)
def returnTickers(self):
newTickerList= []
for lists in tickers:
if lists:
for t in lists:
newTickerList.append(t)
return newTickerList
stock_list = finvizSpider()
process = CrawlerProcess()
process.crawl(finvizSpider)
process.start()
list2 = stock_list.returnTickers()
I get the following error when this is run.

The line if next_page_url is not None: will never be None, You need to check if html is None.
The line next_page_url += html will give you an error when html is None, so first you need to check if it's None.
If html is None, then you can't do html[0], replace extract with extract_first (I used get).
Here is the fixed code:
import scrapy
from scrapy.crawler import CrawlerProcess
import time
class FinvizSpider(scrapy.Spider):
name = "finviz"
urlcheck = 1
tickers = []
start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]
def parse(self, response):
self.tickers.append(response.xpath('//a[#class="screener-link-primary"]/text()').extract())
print(self.tickers)
next_page_url = "https://finviz.com/"
html = response.xpath('//a[#class="screener_arrow"]/#href').get()
print(html)
if html is not None:
next_page_url += html
print(next_page_url)
yield scrapy.Request(next_page_url, callback=self.parse)
def returnTickers(self):
newTickerList= []
for lists in self.tickers:
if lists:
for t in lists:
newTickerList.append(t)
return newTickerList

It looks like scrapy can only callback 5 times, so instead of callingback i would recommend to iterate over a list with all the links, you can do it with BeautifulSoup and it would be very simple.
Install
pip install BeautifulSoup4
BS4 import:
from bs4 import BeautifulSoup
Rest of code:
def parse(self, url):
raw_html = scrapy.Request(url)
good_html = BeautifulSoup(raw_html, 'html.parser')
first_part = "https://finviz.com/"
tickers.append([x.text for x in good_html.findAll('a', {'class':'screener-link-primary'})])
second_part = good_html.find('a', {'class':'screener_arrow'})['href']
# Check if there is next page
if second_part:
next_url = first_part + second_part
self.parse(next_url)

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

As the title states, I am trying to run my scrapy program, the issue I am running into is that it seems to be only returning the yield from the initial url (https://www.antaira.com/products/10-100Mbps).
I am unsure on where my program is not working, in my code I have also left some commented code on what I have attempted.
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider): # classes should be TitleCase
name = 'productJumperFix'
allowed_domains = ['antaira.com']
start_urls = [
'https://www.antaira.com/products/10-100Mbps',
'https://www.antaira.com/products/unmanaged-gigabit'
'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
'https://www.antaira.com/products/Unmanaged-10-gigabit'
'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
]
#def start_requests(self):
# yield scrappy.Request(start_urls, self.parse)
def parse(self, response):
# iterate through each of the relative urls
for url in response.xpath('//div[#class="product-container"]//a/#href').getall():
product_link = response.urljoin(url) # use variable
yield scrapy.Request(product_link, callback=self.parse_new_item)
def parse_new_item(self, response):
for product in response.css('main.products'):
items = AntairaItem() # Unique item for each iteration
items['product_link'] = response.url # get the product link from response
name = product.css('h1.product-name::text').get().strip()
features = product.css(('section.features h3 + ul').strip()).getall()
overview = product.css('.products .product-overview::text').getall()
main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items
Thank you everyone!
Follow up question, for some reason when I run "scrapy crawl productJumperFix" im not getting any output from the terminal,not sure how to debug since I can't even see the output errors.

Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):
name = 'productJumperFix'
allowed_domains = ['antaira.com']
def start_requests(self):
urls = [
'https://www.antaira.com/products/10-100Mbps',
'https://www.antaira.com/products/unmanaged-gigabit',
'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
'https://www.antaira.com/products/Unmanaged-10-gigabit',
'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
]
for url in urls:
yield scrapy.Request(url, callback=self.parse)
def parse(self, response):
for url in response.xpath('//div[#class="product-container"]//a/#href').getall():
product_link = response.urljoin(url) # use variable
yield scrapy.Request(product_link, callback=self.parse_new_item)
def parse_new_item(self, response):
for product in response.css('main.products'):
items = AntairaItem()
items['product_link'] = response.url
name = product.css('h1.product-name::text').get().strip()
features = product.css(('section.features h3 + ul').strip()).getall()
overview = product.css('.products .product-overview::text').getall()
main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items

Python scrapy returns uncomplete data

I am creating a scraper for web data scraping.
There are 58 pages and each page has 12 products.
The data should be return as 58 x 12 = 696 products title but it return only data of 404 products only. Here is my code
import scrapy
from fundrazr.items import FundrazrItem
from datetime import datetime
import re
class Fundrazr(scrapy.Spider):
name = "my_scraper"
# First Start Url
start_urls = ["https://perfumehut.com.pk/shop/"]
npages = 57
# This mimics getting the pages using the next button.
for i in range(2, npages + 1):
start_urls.append("https://perfumehut.com.pk/shop/page/"+str(i)+"")
def parse(self, response):
for href in response.xpath("//h3[contains(#class, 'product-title')]/a/#href"):
# add the scheme, eg http://
url = "" + href.extract()
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
item = FundrazrItem()
# Getting Campaign Title
item['campaignTitle'] = response.xpath("//h1[contains(#class, 'entry-title')]/text()").extract()
yield item
Its a woocommerce website and first page is
https://perfumehut.com.pk/shop/
and other pages as pagination
https://perfumehut.com.pk/shop/page/2/
https://perfumehut.com.pk/shop/page/3/
and up to 58.
I want to know what I did wrong by getting npages ?
Regards

import scrapy
from fundrazr.items import FundrazrItem
from datetime import datetime
import re
class Fundrazr(scrapy.Spider):
name = "my_scraper"
# First Start Url
start_urls = ["https://perfumehut.com.pk/shop/"]
def parse(self, response):
data = FundrazrItem()
for item in response.xpath("//div[contains(#class, 'products elements-grid ')]/div[contains(#class, 'product-grid-item product ')]/h3/a"):
data['campaignTitle'] = item.xpath("./text()").extract_first()
yield data
next_page = response.xpath("//ul[#class='page-numbers']/li[last()]/a/#href").extract_first()
if next_page is not None:
yield scrapy.Request(next_page, callback=self.parse)

Splash for Scrapy only returns empty list

I hope there's someone who can help a newbie:
I try to scrape the prices of https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html using Scrapy. Since those prices are loaded dynamically with Javascript I tried to use Splash to deal with the problem. But the outcome is still the same: Empty lists for the prices ( "hotel_displayed_prices"). The other items do all receive the correct values.
On the webpage I found two ways to get to the price with CSS selector:
.price-wrap .price :: text
.premium-offer-container div::attr(data-locationid)
both ways do not seem to work... or they do both and just splash does not.
for scrapy I copied all configurations from https://github.com/scrapy-plugins/scrapy-splash into my settings file. I did also put Robotstxt_obey = False
when rendering the website in Splash 3.4.1 (browser window) it showed me the price of the hotels so normally it should work I guess.
import scrapy
from ..items import TestItem
from scrapy_splash import SplashRequest
class HoteldataSpider (scrapy.Spider):
name = "Testdata"
start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback=self.parse, args={"wait": 5})
def parse(self, response):
items = TestItem()
all_single_entries = response.css("div.listItem")
for entry in all_single_entries:
hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
hotel_links = entry.css(".listing_title a").xpath("#href").extract()
hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid)").extract()
items["hotel_names"] = str(hotel_names).split("'")[1]
items["hotel_links"] = "https://www.tripadvisor.com" + str(hotel_links).split("'")[1]
items["hotel_ids"] = int(str(hotel_ids).split("_")[1].split("'")[0])
items["hotel_displayed_price"]= hotel_displayed_price
yield items

On this line
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid").extract()
Are you missing a closing bracket on "div::attr(data-locationid" ?

I've had a look at the behaviour under scrapy, and the prices are not returned in the HTML to a request from scrapy. What you're seeing in the browser (even Splash) is not the same as what your code is seeing.
I don't know scrapy well enough to work through this, but it seems possible to get what you need with plain old requests & BeautifulSoup:
import requests
import BeautifulSoup
r = requests.get('https://www.tripadvisor.ie/Hotels-g189541-Copenhagen_Zealand-Hotels.html')
soup = BeautifulSoup(requests.content, 'lxml')
prices = [price.text for price in soup.select('.price-wrap .price')]
print(prices)
['€131', '€112', '€121', '€133', '€172', '€169', '€74', '€189', ...]

For everyone with the similar problem: Here is my solution. However I do have problems with duplicates when I run the script.
import scrapy
from ..items import HotelinfoItem
from scrapy_splash import SplashRequest
class HoteldataSpider (scrapy.Spider):
name = "Hoteldata"
start_urls = ["http://localhost:8050/render.html?url=https:"
"//www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})
def parse(self, response):
items = HotelinfoItem()
all_single_entries = response.css("div.listItem")
for entry in all_single_entries:
hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
hotel_links = entry.css(".listing_title a").xpath("#href").extract()
hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-pernight)").extract()
hotel_type = entry.css(".mb10").css(".label::text").extract()
items["hotel_names"] = [str(hotel_names).split("'")[1]]
items["hotel_links"] = ["https://www.tripadvisor.com" + str(hotel_links).split("'")[1]]
items["hotel_ids"] = [str(hotel_ids).split("_")[1].split("'")[0]]
if len(hotel_type) == 0:
items["hotel_type"] = ["Hotel"]
else:
items["hotel_type"] = hotel_type
if len(hotel_displayed_price) == 0:
items["hotel_displayed_price"] = ["NA"]
else:
items["hotel_displayed_price"] = hotel_displayed_price
yield items
next_page = response.css("a.next::attr(href)").get()
next_page_splash = "http://localhost:8050/render.html?url=https://www.tripadvisor.com" + \
str(next_page).split("#")[0] + "&timeout=10&wait=5"
if next_page is not None:
yield response.follow(next_page_splash, callback=self.parse)

Crawl iframe and page at the same time

I just wanted to know if it's possible to crawl a page on a website and extract data from this page and from an iframe in this page at the same time?
I'm using scrapy with python and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup
class Fnac(CrawlSpider): #scrapy.Spider
name = 'FnacCom'
allowed_domains = ['fnac.com']
start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']
##### To extract links in order to run the spider in them
# rules = (
# Rule(LinkExtractor(allow=()), callback='parse'),
# )
def parse(self, response):
soup = BeautifulSoup(urlopen(response.url), "lxml")
iframexx = soup.find_all('iframe')
for iframe in iframexx:
yield scrapy.Request(iframe.attrs['src'],callback=self.parse2)
##### Main function
def parse1(self, response):
item1 = FnacItem()
nb_sales = response.xpath('//table[#summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
country = response.xpath('//table[#summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
yield scrapy.Request(url, meta={'item': item1}) #I don't know what to put instead of URL...
def parse2(self, response):
same_item = response.meta['item']
address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
email = response.xpath('//div/ul/li[contains(text(),"#")]/text()').extract()
name = response.xpath('//div/p[#class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
if (len(name) != 0):
item['name'] = ''.join(name).strip()
item['address'] = ''.join(address).strip()
item['phone'] = ''.join(phone).strip()
item['email'] = ''.join(email).strip()
item['nb_sales'] = ''.join(nb_sales).strip()
item['country'] = ''.join(country).strip()
item['vat'] = ''.join(vat).strip()
item['siret'] = ''.join(siret).strip()
return item

to combine information from different requests into a similar item, you have to use the meta parameter of the requests:
def parse1(self, response):
item1 = {
...
}
yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)
def parse2(self, response):
same_item = response.meta['item']
# keep populating the item with the second response
...
yield same_item

Webcrawler multiple page iteration

I want to make the crawler go to the next page to extract data any help on what to do. I am a little lost on what to do. I tried scrapy but it is kinda complicated and bs4 is more convenient.
import bs4 as bs
import urllib.request
import pandas as pd
import re
source = urllib.request.urlopen('https://messageboards.webmd.com/').read()
soup = bs.BeautifulSoup(source,'lxml')
df = pd.DataFrame(columns = ['link'],data=[url.a.get('href') for url in soup.find_all('div',class_="link")])
lists=[]
for i in range(0,33):
link = (df.link.iloc[i])
source1 = urllib.request.urlopen(link).read()
soup1 = bs.BeautifulSoup(source1,'lxml')
for url1 in soup1.find_all('a',class_="next"):
next_link = soup1.find('a',href = True, text = re.compile("next"))
if next_link:
lists.append(link+url1.get('href'))

So it looks like you're storing hrefs in a list
for url1 in soup1.find_all('a',class_="next"):
next_link = soup1.find('a',href = True, text = re.compile("next"))
if next_link:
lists.append(link+url1.get('href'))
Now you actually have to do something with them. In this case I'm assuming you want to navigate to each href in your list.
for href in lists:
new_page = urllib.request.urlopen(href).read()
And then you can scrape whatever data you want out of new_page

I've got the same problem. Here is my code example for a page I crawled for exercise. I've chained multiple site requests to get detailed information.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from capterra.items import CapterraItem
class CapterraCatSpider(CrawlSpider):
name = 'capterra_cat'
#allowed_domains = ['http://www.capterra.com/categories']
start_urls = ['http://www.capterra.com/categories']
# rules = (
# Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
# )
def parse(self, response):
#TEMP
for category in response.css('ol.browse-group-list'):
#Debug: only elements of one category
if category.css('a::text').extract_first() == 'Yoga Studio':
i = CapterraItem()
#Get link to detail page
i['cat_name'] = category.css('a::text').extract_first()
#join link to detail page with base url
i['cat_link'] = response.urljoin(category.css('a::attr(href)').extract_first())
cat_link = i['cat_link']
print cat_link
#call request to detail page and pass response to parse_details method with callback method
request = scrapy.Request(cat_link, callback=self.parse_details)
request.meta['item'] = i
yield request
def parse_details(self,response):
#Debug print
print 'DETAILS!'
#read your items from response meta
item = response.meta['item']
#iterate over listings
for detail in response.css('p.listing-description.milli'):
item['profile_link'] = response.urljoin(detail.css('a.spotlight-link::attr(href)').extract_first())
#call request to profile page to get more information for listing
request = scrapy.Request(item['profile_link'], callback=self.parse_profile)
#set your item to rquest metadata
request.meta['item'] = item
yield request
def parse_profile(self,response):
#Debug print
print 'PROFILE'
item = response.meta['item']
item['product_name'] = response.css('h1.beta.no-margin-bottom::text').extract_first()
item['who_uses_software'] = response.css('div.spotlight-target > p.epsilon > i::text').extract_first()
item['vendor_name'] = response.css('h2.spotlight-vendor-name > span::text').extract_first()
return item

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy only going through first 5 links with next_page_url - python

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

Python scrapy returns uncomplete data

Splash for Scrapy only returns empty list

Crawl iframe and page at the same time

Webcrawler multiple page iteration

Categories

Resources