I just wanted to know if it's possible to crawl a page on a website and extract data both from that page and from an iframe in that page at the same time?
I'm using Scrapy with Python, and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup


class Fnac(CrawlSpider):  # scrapy.Spider
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    ##### To extract links in order to run the spider on them
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse'),
    # )

    def parse(self, response):
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        for iframe in iframexx:
            yield scrapy.Request(iframe.attrs['src'], callback=self.parse2)

    ##### Main function
    def parse1(self, response):
        item1 = FnacItem()
        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
        yield scrapy.Request(url, meta={'item': item1})  # I don't know what to put instead of url...

    def parse2(self, response):
        same_item = response.meta['item']
        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
        if len(name) != 0:
            same_item['name'] = ''.join(name).strip()
            same_item['address'] = ''.join(address).strip()
            same_item['phone'] = ''.join(phone).strip()
            same_item['email'] = ''.join(email).strip()
            # nb_sales and country are extracted in parse1 and are not available here yet
            same_item['nb_sales'] = ''.join(nb_sales).strip()
            same_item['country'] = ''.join(country).strip()
            same_item['vat'] = ''.join(vat).strip()
            same_item['siret'] = ''.join(siret).strip()
            return same_item
To combine information from different requests into a single item, you have to use the meta parameter of the requests:
def parse1(self, response):
    item1 = {
        ...
    }
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    same_item = response.meta['item']
    # keep populating the item with the second response
    ...
    yield same_item
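Applied to the spider above, a minimal sketch might look like the following. This is only an illustration: it assumes the iframe src found on the seller page is the second URL you need, that FnacItem defines the fields used here, and parse_iframe is just a name chosen for the second callback. Note that Scrapy's own selectors can read the iframe src directly, so the BeautifulSoup/urlopen step isn't strictly required:

import scrapy
from fnac.items import FnacItem


class Fnac(scrapy.Spider):
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    def parse(self, response):
        # gather the seller data that lives on the main page
        item = FnacItem()
        item['nb_sales'] = ''.join(response.xpath(
            '//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()).strip()
        item['country'] = ''.join(response.xpath(
            '//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()).strip()

        # follow every iframe on the page, carrying the partially filled item along
        for src in response.xpath('//iframe/@src').extract():
            yield scrapy.Request(response.urljoin(src), meta={'item': item},
                                 callback=self.parse_iframe)

    def parse_iframe(self, response):
        item = response.meta['item']
        # keep populating the item from the iframe's response here, then yield it
        yield item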
My code seemingly only goes through the first 5 links that are requested and then stops when the 6th is requested. I have tried using start_urls and next_page_url; both only extract from the first 5 pages given.
import scrapy
from scrapy.crawler import CrawlerProcess
import time


class finvizSpider(scrapy.Spider):
    global tickers
    global urlcheck
    urlcheck = 1
    tickers = []
    name = "finviz"
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(tickers)
        next_page_url = "https://finviz.com/"
        html = response.xpath(
            '//a[@class="screener_arrow"]/@href').extract()[0]
        print(html)
        next_page_url += html
        print(next_page_url)
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        newTickerList = []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
Here is the error statement:
Any help is appreciated.
EDIT:
I have updated the code, but still seem to get errors.
import scrapy
from scrapy.crawler import CrawlerProcess
import time
from bs4 import BeautifulSoup


class finvizSpider(scrapy.Spider):
    global tickers
    global urlcheck
    urlcheck = 1
    tickers = []
    name = "finviz"
    start_urls = [
        "https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=-change"]

    def parse(self, url):
        raw_html = scrapy.Request(url)
        good_html = BeautifulSoup(raw_html, 'html.parser')
        first_part = "https://finviz.com/"
        tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
        second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
        # Check if there is a next page
        if second_part:
            next_url = first_part + second_part
            self.parse(next_url)

    def returnTickers(self):
        newTickerList = []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList

stock_list = finvizSpider()
process = CrawlerProcess()
process.crawl(finvizSpider)
process.start()
list2 = stock_list.returnTickers()
I get the following error when this is run.
The check if next_page_url is not None: will never fail, because next_page_url is never None; you need to check whether html is None instead.
The line next_page_url += html will give you an error when html is None, so first you need to check whether it is None.
Since the page may have no match, extract()[0] can fail as well, so replace extract()[0] with extract_first() (I used get()).
Here is the fixed code:
import scrapy
from scrapy.crawler import CrawlerProcess
import time


class FinvizSpider(scrapy.Spider):
    name = "finviz"
    urlcheck = 1
    tickers = []
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        self.tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(self.tickers)
        next_page_url = "https://finviz.com/"
        html = response.xpath('//a[@class="screener_arrow"]/@href').get()
        print(html)
        if html is not None:
            next_page_url += html
            print(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        newTickerList = []
        for lists in self.tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
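If you also want to read the results back in the script that launches the crawl (as in your earlier snippet), note that tickers is a class-level list that parse() mutates in place, so it is still populated after the crawl finishes. A minimal sketch, assuming the fixed spider above:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(FinvizSpider)
process.start()  # blocks until the crawl is finished

# tickers is a class attribute mutated in place, so the scraped pages survive the crawl
all_tickers = [t for page in FinvizSpider.tickers if page for t in page]
print(all_tickers)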
It looks like Scrapy only follows the callback 5 times here, so instead of relying on the callback chain I would recommend iterating over a list with all the links. You can do it with BeautifulSoup and it would be very simple.
Install
pip install BeautifulSoup4
BS4 import:
from bs4 import BeautifulSoup
Rest of code:
def parse(self, url):
    raw_html = scrapy.Request(url)
    good_html = BeautifulSoup(raw_html, 'html.parser')
    first_part = "https://finviz.com/"
    tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
    second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
    # Check if there is a next page
    if second_part:
        next_url = first_part + second_part
        self.parse(next_url)
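Note that scrapy.Request(url) only builds a request object; it does not download the page, which is likely why the edited version above still fails. If you want to walk the pages with BeautifulSoup outside of Scrapy's callback chain, a sketch using an iterative loop and the requests library (an extra dependency, not used in the original code) would be:

import requests
from bs4 import BeautifulSoup


def fetch_tickers(start_url):
    """Follow the 'next' arrow page by page and collect every primary ticker link."""
    tickers = []
    url = start_url
    while url:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        tickers.extend(a.text for a in soup.find_all('a', {'class': 'screener-link-primary'}))
        arrow = soup.find('a', {'class': 'screener_arrow'})
        # stop when there is no next-page arrow left
        url = ("https://finviz.com/" + arrow['href']) if arrow else None
    return tickers

The site may also require request headers such as a User-Agent, so adjust requests.get accordingly.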
First of all thanks a lot for your help!
I don't know why I'm obtaining only two results per page. Could you please help me? Here is the code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from mercado.items import MercadoItem


class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domain = ['https://www.amazon.es']
    start_urls = ['https://www.amazon.es/s/ref=sr_pg_2rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254']

    rules = {
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[h2]')),
             callback='parse_item', follow=False)
    }

    def start_requests(self):
        yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254", self.parse_item)
        for i in range(2, 400):
            yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=" + str(i) + "&keywords=febi&ie=UTF8&qid=1535314254", self.parse_item)

    def parse_item(self, response):
        for mercado in response.xpath('//*[h2]'):
            ml_item = MercadoItem()
            ml_item['articulo'] = response.xpath("@title").extract()[0]
            ml_item['precio'] = response.xpath("@href").extract()[0]
            yield ml_item
You need to search relative to your mercado element:
def parse_item(self, response):
    for mercado in response.xpath('//*[h2]'):
        ml_item = MercadoItem()
        ml_item['articulo'] = mercado.xpath("@title").extract()[0]
        ml_item['precio'] = mercado.xpath("@href").extract()[0]
        yield ml_item
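Depending on where the title and href actually sit in the markup (which isn't shown here), you may also need to descend with a relative .// path instead of reading attributes off the matched element itself. A sketch only, assuming each result block wraps an <a> that carries both attributes:

def parse_item(self, response):
    for mercado in response.xpath('//*[h2]'):
        ml_item = MercadoItem()
        # the leading './/' keeps the search relative to this result block
        ml_item['articulo'] = mercado.xpath('.//a/@title').extract_first()
        ml_item['precio'] = mercado.xpath('.//a/@href').extract_first()
        yield ml_item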
I've been able to extract more than two results.
Take a look at the code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider
import urllib
from mercado.items import MercadoItem


class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domain = ['https://www.amazon.es']
    start_urls = ['https://www.amazon.es/s/ref=sr_pg_2rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254']

    def start_requests(self):
        yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=1&keywords=febi&ie=UTF8&qid=1535314254", self.parse_item)
        for i in range(2, 400):
            yield scrapy.Request("https://www.amazon.es/s/ref=sr_pg_2?rh=n%3A1951051031%2Cn%3A2424922031%2Ck%3Afebi&page=" + str(i) + "&keywords=febi&ie=UTF8&qid=1535314254", self.parse_item)

    def parse_item(self, response):
        namelist = response.xpath('//a[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@title').extract()
        #precio = response.xpath('//a/span[@class="a-size-base a-color-price s-price a-text-bold"]').extract()
        listlength = len(namelist)
        for i in range(0, listlength):
            ml_item = MercadoItem()
            ml_item['articulo'] = response.xpath('//a[@class="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@title').extract()
            #ml_item['precio'] = response.xpath('//a/span[@class="a-size-base a-color-price s-price a-text-bold"]').extract()
            yield ml_item
Now every CSV line contains all the results of the page. My problem now is adding the price of every article; I'm stuck here. How can I add it?
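One way to pair each title with its price is to iterate over one result container at a time instead of re-running the page-wide selectors inside the loop. This is only a sketch: the li/s-result-item container and the price class below are assumptions about Amazon's markup (based on the commented-out selector above) and may need adjusting:

def parse_item(self, response):
    # one container per search result, so each item gets its own name/price pair
    for result in response.xpath('//li[contains(@class, "s-result-item")]'):
        ml_item = MercadoItem()
        ml_item['articulo'] = result.xpath(
            './/a[contains(@class, "s-access-detail-page")]/@title').extract_first()
        ml_item['precio'] = result.xpath(
            './/span[contains(@class, "s-price")]/text()').extract_first()
        if ml_item['articulo']:
            yield ml_item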
I'm scraping a site by going through the search page, then looping through all results within. However, it only seems to return the first result for each page. I also don't think it's hitting the start page's results either.
Secondly, the price is returned as a Unicode string containing the £ symbol; how can I remove it altogether, leaving just the price?
'regular_price': [u'\xa38.59'],
Here is the HTML:
http://pastebin.com/F8Lud0hu
Here's the spider:
import scrapy
import random
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from cdl.items import candleItem


class cdlSpider(CrawlSpider):
    name = "cdl"
    allowed_domains = ["www.xxxx.co.uk"]
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(
            allow=['advanced_search_result\.php\?sort=2a&page=\d*']),
            callback='parse_listings',
            follow=True)
    ]

    def parse_listings(self, response):
        sel = Selector(response)
        urls = sel.css('a.product_img')
        for url in urls:
            url = url.xpath('@href').extract()[0]
            return scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        candle = candleItem()
        n = response.css('.prod_info_name h1')
        candle['name'] = n.xpath('.//text()').extract()[0]

        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price').xpath('.//text()').extract()
        else:
            candle['was_price'] = response.css('.was_price strong').xpath('.//text()').extract()
            candle['now_price'] = response.css('.now_price strong').xpath('.//text()').extract()

        candle['referrer'] = response.request.headers.get('Referer', None)
        candle['url'] = response.request.url
        yield candle
Yes, it's returning only the first result because of your parse_listings method: you're returning the first URL when you should be yielding a request for each one. I would do something like:
def parse_listings(self, response):
    for url in response.css('a.product_img::attr(href)').extract():
        yield Request(url, callback=self.parse_item)
In that case I would even do something like:
class CdlspiderSpider(CrawlSpider):
    name = 'cdlSpider'
    allowed_domains = ['www.xxxx.co.uk']
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(allow='advanced_search_result\.php\?sort=2a&page=\d*')),
        Rule(LinkExtractor(restrict_css='a.product_img'), callback='parse_item')
    ]

    def parse_item(self, response):
        ...
        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price::text').re_first(r'\d+\.?\d*')
        else:
            candle['was_price'] = response.css('.was_price strong::text').re_first(r'\d+\.?\d*')
            candle['now_price'] = response.css('.now_price strong::text').re_first(r'\d+\.?\d*')
        ...
        return candle
To remove the £, just replace it with an empty string like this:
pricewithpound = u'\xa38.59'
price = pricewithpound.replace(u'\xa3', '')
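If you would rather end up with a number instead of a string, the same replace can feed straight into float() (a small sketch):

pricewithpound = u'\xa38.59'
price = float(pricewithpound.replace(u'\xa3', ''))  # 8.59 as a float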
To investigate the Scrapy issue, can you please provide the HTML source?
I am trying to extract information from Listing and Detail pages.
The code below correctly scrapes the reviewer information from the Listing page and from all linked pages (where an <a> contains "Next").
The detail_pages URLs are also captured, e.g. http://www.screwfix.com/p/prysmian-6242y-twin-earth-cable-2-5mm-x-100m-grey/20967
However, I cannot see how to navigate to and scrape the information from the Detail pages.
Is there anyone here who has used Scrapy successfully who can help me finish this spider?
Thank you for the help.
I include the code for the spider below:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from hn_scraper.items import HnArticleItem


class ScrewfixSpider(Spider):
    name = "Screwfix"
    allowed_domains = ["www.screwfix.com"]
    start_urls = ('http://www.screwfix.com/', )

    link_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//a[contains(., "Next")]', ))

    detail_page_extractor = SgmlLinkExtractor(
        allow=('www', ),
        restrict_xpaths=('//tr[@id[contains(., "reviewer")]]/td[3]/a', ))

    def extract_one(self, selector, xpath, default=None):
        extracted = selector.xpath(xpath).extract()
        if extracted:
            return extracted[0]
        return default

    def parse(self, response):
        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            yield request

        for item in self.parse_item(response):
            yield item

    def parse_item(self, response):
        selector = Selector(response)
        rows = selector.xpath('//table[contains(.,"crDataGrid")]//tr[@id[contains(., "reviewer")]]')
        for row in rows:
            item = HnArticleItem()

            reviewer = row.xpath('td[3]/a')
            reviewer_url = self.extract_one(reviewer, './@href', '')
            reviewer_name = self.extract_one(reviewer, 'b/text()', '')
            total_reviews = row.xpath('td[4]/text()').extract()

            item['url'] = reviewer_url
            item['name'] = reviewer_name
            item['total_reviews'] = total_reviews
            yield item

        detail_pages = self.detail_page_extractor.extract_links(response)
        if detail_pages:
            print 'detail_pages'
            print detail_pages[0].url
            yield Request(detail_pages[0].url)
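One way to finish the spider (a sketch, not the original author's solution) is to give the detail-page requests their own callback and extract the detail fields there. parse_detail is a hypothetical name, and only the tail of parse_item changes so that every detail link is followed instead of just detail_pages[0]:

    def parse_item(self, response):
        # ... keep the reviewer extraction above unchanged ...

        # follow every detail-page link and scrape it in its own callback
        for link in self.detail_page_extractor.extract_links(response):
            yield Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        # 'parse_detail' is an illustrative callback, not part of the original spider
        selector = Selector(response)
        item = HnArticleItem()
        item['url'] = response.url
        # pull whatever detail-page fields you need here, e.g.:
        # item['name'] = self.extract_one(selector, '//h1/text()', '')
        yield item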
I am trying to use scrapy to scrape a website that has several pages of information.
my code is:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item


class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["http://www.tcgplayer.com/"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='magicCard']")
        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item
I am trying to scrape all the pages until it reaches the end of the pages... sometimes there will be more pages than others, so it's hard to say exactly where the page numbers end.
The idea is to increment pageNumber until no titles are found. If there are no titles on the page, raise a CloseSpider exception to stop the spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item

URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"


class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print self.page_number
        print "----------"

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            raise CloseSpider('No more pages')

        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]

            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        self.page_number += 1
        yield Request(URL % self.page_number)
This particular spider would go through all 8 pages of the data, then stop.
Hope that helps.