Scrapy Webcrawler and Data Extractor - python

I am trying to create a webcrawler with Scrapy. I am using a template that I have used before, but I can't seem to get it to parse the URLs. I can see it go to YouTube and then to the watch page, but from there it won't pull the title or description or anything, because it always fails to parse.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
from krakenkrawler.items import KrakenItem

class AttractionSpider(CrawlSpider):
    name = "thekraken"
    allowed_domains = ["youtube.com"]
    start_urls = [
        "http://www.youtube.com/?gl=GB&hl=en-GB"
    ]
    rules = ()

    def __init__(self, name=None, **kwargs):
        super(AttractionSpider, self).__init__(name, **kwargs)
        self.items_buffer = {}
        self.base_url = "http://www.youtube.com"
        from scrapy.conf import settings
        settings.overrides['DOWNLOAD_TIMEOUT'] = 360
    def parse(self, response):
        print "Start scrapping Attractions...."
        try:
            hxs = HtmlXPathSelector(response)
            links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
            if not links:
                log.msg("No Data to scrap")
                return
            for link in links:
                v_url = ''.join(link.extract())
                if not v_url:
                    continue
                else:
                    _url = self.base_url + v_url
                    yield Request(url=_url, callback=self.parse_details)
        except Exception as e:
            log.msg("Parsing failed for URL {%s}" % format(response.request.url))
            raise
    def parse_details(self, response):
        print "Start scrapping Detailed Info...."
        try:
            hxs = HtmlXPathSelector(response)
            l_venue = KrakenItem()

            v_name = hxs.select("//*[@id='eow-title'].text").extract()
            if not v_name:
                v_name = hxs.select("//*[@id='eow-title'].text").extract()
            l_venue["name"] = v_name[0].strip()

            base = hxs.select("//*[@id='content']/div[7]")
            if base.extract()[0].strip() == "<div style=\"clear:both\"></div>":
                base = hxs.select("//*[@id='content']/div[8]")
            elif base.extract()[0].strip() == "<div style=\"padding-top:10px;margin-top:10px;border-top:1px dotted #DDD;\">\n You must be logged in to add a tip\n </div>":
                base = hxs.select("//*[@id='content']/div[6]")

            x_datas = base.select("div[1]/b").extract()
            v_datas = base.select("div[1]/text()").extract()
            i_d = 0
            if x_datas:
                for x_data in x_datas:
                    print "data is:" + x_data.strip()
                    if x_data.strip() == "<b>Address:</b>":
                        l_venue["address"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Contact:</b>":
                        l_venue["contact"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Operating Hours:</b>":
                        l_venue["hours"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Website:</b>":
                        l_venue["website"] = (base.select("//*[@id='watch-actions-share-panel']/div/div[2]/div[2]/span[1]/input/text()").extract())[0].strip()
                    i_d += 1

            v_photo = base.select("img/@src").extract()
            if v_photo:
                l_venue["photo"] = v_photo[0].strip()

            v_desc = base.select("div[3]/text()").extract()
            if v_desc:
                desc = ""
                for dsc in v_desc:
                    desc += dsc
                l_venue["desc"] = desc.strip()

            v_video = hxs.select("//*[@id='content']/iframe/@src").extract()
            if v_video:
                l_venue["video"] = v_video[0].strip()

            yield l_venue
        except Exception as e:
            log.msg("Parsing failed for URL {%s}" % format(response.request.url))
            raise
Thanks a ton in advance.

The problem is that the structure you are looking for, "//h3[@class='yt-lockup-title']//a/@href", is not present on all pages.
I modified your code to verify what pages are opened and what data are extracted:
class AttractionSpider(CrawlSpider):
    name = "thekraken"
    bot_name = 'kraken'
    allowed_domains = ["youtube.com"]
    start_urls = ["http://www.youtube.com/?gl=GB&hl=en-GB"]
    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        print "Start scrapping Attractions...."
        print response.url
        try:
            hxs = HtmlXPathSelector(response)
            links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
            for link in links:
                v_url = ''.join(link.extract())
                print v_url
            if not links:
                log.msg("No Data to scrap")
        except:
            pass
Result is something like this:
Start scrapping Attractions....http://www.youtube.com/watch?v=GBdCbciGLK0
Start scrapping Attractions....http://www.youtube.com/watch?v=BxUjDpnSHyc&list=TL4PEfm95Wz3k
Start scrapping Attractions.... http://www.youtube.com/watch?v=T-CZW4YjAig
Start scrapping Attractions....
https://www.youtube.com/user/ComedyShortsGamer
/watch?v=TdICODRvAhc&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=CDGzm5edrlw&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F2oR5KS54JM&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=LHRzOIvqmQI&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F4iqiM6h-2U&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=ug3UPIvWlvU&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=msiZs6lIZ9w&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=Jh6A3DoOLBg&list=UUrqsNpKuDQZreGaxBL_a5Jg
On the inner pages where no results are scraped, there is no "yt-lockup-title" class.
In brief, you have to improve your spider.
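For example, a minimal sketch of one such improvement (untested, reusing base_url and parse_details from the question's spider): only queue /watch URLs for the detail callback and skip pages where the expected markup is missing, instead of letting the exception propagate.
def parse_items(self, response):
    hxs = HtmlXPathSelector(response)
    links = hxs.select("//h3[@class='yt-lockup-title']//a/@href").extract()
    if not links:
        # this page type has no video listing markup; log it and move on
        log.msg("No video links found on %s" % response.url)
        return
    for href in links:
        # only the watch pages carry the title/description we want
        if href.startswith('/watch'):
            yield Request(self.base_url + href, callback=self.parse_details)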

Related

Scrapy file, only running the initial start_urls instead of running though the whole list

As the title states, I am trying to run my Scrapy program; the issue I am running into is that it seems to only return the yield from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; in my code I have also left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal; I'm not sure how to debug since I can't even see the output errors.
Try using the start_requests method:
For example:
import scrapy
from ..items import AntairaItem

class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
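As a side note, the most likely reason only the first URL ran in the original spider is the missing commas in start_urls: adjacent string literals in Python are concatenated into a single (invalid) URL. A small standalone illustration of that behaviour:
urls = [
    'https://www.antaira.com/products/10-100Mbps',
    'https://www.antaira.com/products/unmanaged-gigabit'          # <-- no comma
    'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'   # <-- no comma
]
print(len(urls))  # 2, not 3: the last two literals merged into one string
print(urls[1])    # ...unmanaged-gigabithttps://www.antaira.com/products/unmanaged-10-100Mbps-PoE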

Crawl iframe and page at the same time

I just wanted to know if it's possible to crawl a page on a website and extract data both from that page and from an iframe on it at the same time?
I'm using scrapy with python and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup

class Fnac(CrawlSpider):  # scrapy.Spider
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    ##### To extract links in order to run the spider in them
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse'),
    # )

    def parse(self, response):
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        for iframe in iframexx:
            yield scrapy.Request(iframe.attrs['src'], callback=self.parse2)

    ##### Main function
    def parse1(self, response):
        item1 = FnacItem()
        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
        yield scrapy.Request(url, meta={'item': item1})  # I don't know what to put instead of URL...

    def parse2(self, response):
        same_item = response.meta['item']
        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
        if (len(name) != 0):
            item['name'] = ''.join(name).strip()
            item['address'] = ''.join(address).strip()
            item['phone'] = ''.join(phone).strip()
            item['email'] = ''.join(email).strip()
            item['nb_sales'] = ''.join(nb_sales).strip()
            item['country'] = ''.join(country).strip()
            item['vat'] = ''.join(vat).strip()
            item['siret'] = ''.join(siret).strip()
            return item
To combine information from different requests into a single item, you have to use the meta parameter of the request:
def parse1(self, response):
    item1 = {
        ...
    }
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    same_item = response.meta['item']
    # keep populating the item with the second response
    ...
    yield same_item
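Applied to the spider above, the url would be the iframe src itself: scrape the seller fields from the main page first, then pass the partially filled item to the iframe request through meta. A rough sketch (untested, reusing the question's imports, FnacItem, and selectors):
def parse(self, response):
    item = FnacItem()
    item['nb_sales'] = ''.join(response.xpath(
        '//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()).strip()
    item['country'] = ''.join(response.xpath(
        '//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()).strip()
    # each iframe src becomes the url of the follow-up request
    for src in response.xpath('//iframe/@src').extract():
        yield scrapy.Request(response.urljoin(src), meta={'item': item}, callback=self.parse2)

def parse2(self, response):
    item = response.meta['item']
    item['address'] = ''.join(response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')).strip()
    # ... fill the remaining fields (email, name, phone, siret, vat) the same way ...
    yield item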

python web recursive scraping error

I am trying to scrape multiple pages. Its structured like this:
--> Page 1 - Scrape links
-------> Page 2 - Scrape more links (some pages contain pagination) and data
------------> Page 3 - Scrape the data
It returns 18 items, but there are 127 pages (in the 2nd step) with 18 items per page. It also does not return author and author_link in the item.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor as lext
from scrapy.selector import Selector
from scrapy.http import Request
from ror.items import RorItem

class RorSpiderSpider(CrawlSpider):
    name = "ror_spider"
    allowed_domains = ["example.com"]
    start_urls = (
        'http://www.example.com/',
    )
    rules = [
        Rule(lext(allow=("http://www.example.com/$"), restrict_xpaths=('//a[@class="nextpostslink"]',)), callback='parse', follow=True),
    ]

    def parse(self, response):
        links = Selector(response).xpath('//ul[@id="nav"]/li')
        for link in links:
            item = RorItem()
            item['menu_link'] = link.xpath('a/@href').extract()[0]
            item['menu_title'] = link.xpath('a/text()').extract()[0]
            if "http" not in item['menu_link']:
                item['menu_link'] = "http://www.reviewofreligions.org" + ''.join(item['menu_link'])
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)
            else:
                yield Request(url=item['menu_link'], meta={'item': item}, callback=self.parse_articles)

    def parse_articles(self, response):
        sel = Selector(response)
        item = response.meta['item']
        if "articles" in item['menu_link']:
            item['link_cat'] = item['menu_title']
            pg = 1
            maxPgs = 124
            while pg <= 124:
                item['article_pg_link'] = item['menu_link'] + "page/" + str(pg) + "/"
                article_links = sel.xpath('//div[@id="rightcol"]/div[@class="articlebox"]')
                for art_link in article_links:
                    item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
                    item['article_title'] = art_link.xpath('a[@class="title "]/text()').extract()[0].replace('\n\t\t\t\t', '').replace('\t\t\t\t', '')
                    # article_txt_1 = art_link.xpath('text()').extract()[1].replace('\n \n\t\t\t\t', '').replace('\t\t\t\t', '').replace('\n \n', '')
                    # article_txt_2 = art_link.xpath('text()').extract()[2].replace('\n \n\t\t\t\t', '') if art_link.xpath('text()').extract()[2] else ''
                    # item['article_txt'] = article_txt_1 + '\n'.join(article_txt_2).replace('\n\n\n \n\n\n \n \n \n \n\n\n\t\n\t\n\t', '')
                    yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)
                pg += 1

    def article_page(self, response):
        select = Selector(response)
        item = response.meta['item']
        item['author'] = select.xpath('//div[@id="author"]/a/text()').extract()
        item['author_link'] = select.xpath('//div[@id="author"]/a/@href').extract()
        return item
What is wrong in the code?
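One thing that stands out: the while loop in parse_articles builds page URLs but never requests them, so the same response is parsed 124 times, which would explain getting only one page's worth of items. A rough sketch of requesting each pagination page instead (untested; parse_article_list is a hypothetical helper, and the item is copied so parallel requests don't overwrite each other's author fields):
def parse_articles(self, response):
    item = response.meta['item']
    if "articles" in item['menu_link']:
        item['link_cat'] = item['menu_title']
        for pg in range(1, 125):
            page_url = item['menu_link'] + "page/%d/" % pg
            yield Request(url=page_url, meta={'item': item}, callback=self.parse_article_list)

def parse_article_list(self, response):
    for art_link in response.xpath('//div[@id="rightcol"]/div[@class="articlebox"]'):
        item = RorItem(response.meta['item'])  # copy, so each article gets its own item
        item['article_link'] = art_link.xpath('a[@class="title "]/@href').extract()[0]
        yield Request(url=item['article_link'], meta={'item': item}, callback=self.article_page)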

Combine FormRequest and CrawlSpider

I need to apply FormRequest [From here][1]:
#Request = FormRequest.from_response(
#    response,
#    formname='frmSearch',
#    formdata={'classtype': 'of'},
#    #callback=self.parse_links,
#    dont_filter=True,
#    )
I need to apply it to the links in start_urls and to all pages that I get from the rules in my CrawlSpider.
class QuokaSpider(CrawlSpider):
    name = 'quoka'
    allowed_domains = ['www.quoka.de']
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen/']
    curr_page = 0

    rules = (Rule(LinkExtractor(allow=(r'.+'), restrict_xpaths=[u'//li[@class="arr-rgt active"]', ]),
                  follow=True, callback='parse_links'),
             )

    def _url(self, url):
        return 'http://www.quoka.de' + url

    def parse_links(self, response):
        hxs = Selector(response)
        lnks = hxs.xpath('//a[contains(@class, "img-lmtr") and contains(@class, "multi") or contains(@class, "single")]/@href').extract()
        filters = hxs.xpath(u'//div[@class="modal-title"]/text()').extract()
        for fil in filters:
            print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + fil + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        for url in lnks:
            request = Request(self._url(url), callback=self.parse_object)
            yield request

    def parse_object(self, response):
        item = AnbieterItem()
        hxs = Selector(response)
        item['Beschreibung'] = hxs.xpath(u'//div[@class="text"]/text()').extract()
        # item['Kleinanzeigen_App'] = '1'
        # item['Preis'] = '1'
        return item
If I try to apply the filter in start_requests, the spider does not use the pages from the rules.
How can I solve this problem and apply this filter to the start URL and to the URLs from the rules?
I don't know how to combine CrawlSpider rules with FormRequest, but I'd like to suggest that you replace the CrawlSpider with a generic Spider and create the requests manually.
The Rule in your code only takes care of following the pagination (as far as I can see). To replace it you could use something like the following code sample:
import scrapy

class TestSpider(scrapy.Spider):
    name = 'quoka'
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']

    def parse(self, response):
        request = scrapy.FormRequest.from_response(
            response,
            formname='frmSearch',
            formdata={'classtype': 'of'},
            callback=self.parse_filtered
        )
        print request.body
        yield request

    def parse_filtered(self, response):
        resultList = response.xpath('//div[@id="ResultListData"]/ul/li')
        for resultRow in resultList:
            xpath_Result_Details = './/div[@class="q-col n2"]/a'
            # Check if row has details
            if resultRow.xpath(xpath_Result_Details):
                result_Details = resultRow.xpath(xpath_Result_Details)
                # If YES extract details
                title = result_Details.xpath('./@title').extract()
                href = result_Details.xpath('./@href').extract()[0]
                # Code to request detail pages goes here ...
                print title, href

        # Use this instead of CrawlSpider to follow the pagination links
        xpath_NextPage = '//div[@class="rslt-pagination"]//li[@class="arr-rgt active"]/a'
        if response.xpath(xpath_NextPage):
            nextPage_href = response.xpath(xpath_NextPage + '/@href').extract()[0]
            nextPage_url = 'http://www.quoka.de/immobilien/bueros-gewerbeflaechen' + nextPage_href
            nextPage_num = response.xpath(xpath_NextPage + '/@data-qng-page').extract()[0]
            # request = scrapy.Request(nextPage_url, callback=self.parse_filtered)
            # Create request with formdata ...
            request = scrapy.FormRequest.from_response(
                response,
                formname='frmNaviSearch',
                formdata={'pageno': nextPage_num},
                callback=self.parse_filtered
            )
            yield request
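To fill in the "Code to request detail pages goes here" placeholder, one option (a sketch, reusing the AnbieterItem and parse_object idea from the question; response.urljoin handles relative and absolute hrefs alike) would be:
def parse_filtered(self, response):
    for resultRow in response.xpath('//div[@id="ResultListData"]/ul/li'):
        detail = resultRow.xpath('.//div[@class="q-col n2"]/a')
        if detail:
            href = detail.xpath('./@href').extract()[0]
            # request the detail page instead of only printing it
            yield scrapy.Request(response.urljoin(href), callback=self.parse_object)
    # ... pagination FormRequest as in the answer above ...

def parse_object(self, response):
    item = AnbieterItem()
    item['Beschreibung'] = response.xpath(u'//div[@class="text"]/text()').extract()
    return item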

scraping multiple pages with scrapy

I am trying to use scrapy to scrape a website that has several pages of information.
My code is:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["http://www.tcgplayer.com/"]
    start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='magicCard']")
        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item
I am trying to scrape all the pages until it reaches the end ... sometimes there will be more pages than others, so it's hard to say exactly where the page numbers end.
The idea is to increment pageNumber until there are no titles found. If there are no titles on the page, throw a CloseSpider exception to stop the spider:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from tcgplayer1.items import Tcgplayer1Item

URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"

class MySpider(BaseSpider):
    name = "tcg"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print self.page_number
        print "----------"

        sel = Selector(response)
        titles = sel.xpath("//div[@class='magicCard']")
        if not titles:
            raise CloseSpider('No more pages')

        for title in titles:
            item = Tcgplayer1Item()
            item["cardname"] = title.xpath(".//li[@class='cardName']/a/text()").extract()[0]
            vendor = title.xpath(".//tr[@class='vendor ']")
            item["price"] = vendor.xpath("normalize-space(.//td[@class='price']/text())").extract()
            item["quantity"] = vendor.xpath("normalize-space(.//td[@class='quantity']/text())").extract()
            item["shipping"] = vendor.xpath("normalize-space(.//span[@class='shippingAmount']/text())").extract()
            item["condition"] = vendor.xpath("normalize-space(.//td[@class='condition']/a/text())").extract()
            item["vendors"] = vendor.xpath("normalize-space(.//td[@class='seller']/a/text())").extract()
            yield item

        self.page_number += 1
        yield Request(URL % self.page_number)
This particular spider would go through all 8 pages of the data, then stop.
Hope that helps.
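A small variant of the same idea (a sketch, not from the original answer): carry the page number in the request's meta instead of a spider attribute, so the spider keeps no mutable counter state.
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request

URL = "http://store.tcgplayer.com/magic/journey-into-nyx?pageNumber=%d"

class MetaPagingSpider(BaseSpider):
    name = "tcg_meta"
    allowed_domains = ["tcgplayer.com"]
    start_urls = [URL % 1]

    def parse(self, response):
        page = response.meta.get('page', 1)
        titles = Selector(response).xpath("//div[@class='magicCard']")
        if not titles:
            raise CloseSpider('No more pages')
        # ... build and yield the same Tcgplayer1Item fields as in the answer above ...
        yield Request(URL % (page + 1), meta={'page': page + 1})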
