Recursive web crawling in Python

This is my code:
import requests
from bs4 import BeautifulSoup
import re


class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if targetURL.startswith('/'):
                targetURL = target + targetURL  # add http:// and hostname to url
            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"


pages = []
imgCount = 0
target = raw_input("Please enter base url: ")
data = BeautifulSoup(requests.get(target).text, 'html.parser')
link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)
print "===================== Total Collected Images =====================\n"
print imgCount
I want it to continue into the other pages, i.e. keep counting until there are no links left.
When I call the check function recursively, it doesn't work!
import requests
from bs4 import BeautifulSoup
import re


class WebCrawler():
    def check(self, links):
        global imgCount
        for item in links:
            targetURL = item['href']
            if targetURL.startswith('/'):
                targetURL = target + targetURL  # add http:// and hostname to url
            target_html = requests.get(targetURL)
            parsed_html = BeautifulSoup(target_html.text, 'html.parser')
            if parsed_html.title.text not in pages:
                pages.append(parsed_html.title.text)
                print "[+] Collecting images page : " + parsed_html.title.text
                images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
                for img_url in images:
                    imgCount = imgCount + 1
                    # print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
                lnks = parsed_html.find_all('a')
                self.check(lnks)


pages = []
imgCount = 0
target = raw_input("Please enter base url: ")
data = BeautifulSoup(requests.get(target).text, 'html.parser')
link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)
print "===================== Total Collected Images =====================\n"
print imgCount
I added these lines to it:
lnks = parsed_html.find_all('a')
self.check(lnks)
This time, the loop executed only once!
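For reference, the intended behaviour can be sketched with plain requests and BeautifulSoup like this (Python 3; names and structure are illustrative, not a drop-in fix for the code above). The key points are deduplicating by URL and staying on the starting host so the recursion terminates:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

visited = set()
img_count = 0

def crawl(url, base_host):
    global img_count
    if url in visited or urlparse(url).netloc != base_host:
        return
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).text
    except requests.RequestException:
        return
    soup = BeautifulSoup(html, 'html.parser')
    img_count += len(soup.find_all('img', src=True))   # count every image on this page
    for a in soup.find_all('a', href=True):
        crawl(urljoin(url, a['href']), base_host)       # recurse into every resolved link

start = input("Please enter base url: ")
crawl(start, urlparse(start).netloc)
print("Total collected images:", img_count)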

Try something like this:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem


class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/npo"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//span[@class="pl"]')
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item["title"] = titles.xpath("a/text()").extract()
            item["link"] = titles.xpath("a/@href").extract()
            items.append(item)
        return items
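Note that scrapy.contrib and SgmlLinkExtractor are deprecated in current Scrapy releases; a rough modern sketch of the same idea, adapted to the image counting the question asks about (assumes Scrapy 1.x or later; the domain and spider name are placeholders):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ImageCountSpider(CrawlSpider):
    name = "imgcount"                      # hypothetical spider name
    allowed_domains = ["example.com"]      # hypothetical domain
    start_urls = ["http://example.com/"]

    # Follow every in-domain link and run parse_items on each response.
    rules = (
        Rule(LinkExtractor(allow=()), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        # Count images whose src ends in jpg/jpeg/png/svg on this page.
        srcs = response.xpath('//img/@src').extract()
        count = sum(1 for s in srcs if s.lower().endswith(('.jpg', '.jpeg', '.png', '.svg')))
        yield {"url": response.url, "image_count": count}

You would run it with something like scrapy crawl imgcount -o counts.json and sum the per-page counts afterwards.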

Related

Scrapy - ValueError: Missing scheme in request url: #mw-head

I'm getting the following traceback but unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
    name = 'missle_spider_bio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']
This is the part giving me issues (I believe):
    def parse(self, response):
        filename = response.url.split('/')[-1]
        table = response.xpath('///div/table[2]/tbody')
        rows = table.xpath('//tr')
        row = rows[2]
        row.xpath('td//text()')[0].extract()
        wdata = {}
        for row in response.xpath('//*[@class="wikitable"]//tbody//tr'):
            for link in response.xpath('//a/@href'):
                link = link.extract()
                if (link.strip() != ''):
                    yield Request(link, callback=self.parse)
                    #wdata.append(link)
                else:
                    yield None
            #wdata = {}
            #wdata['link'] = BASE_URL +
            #row.xpath('a/@href').extract()  #[0]
            wdata['link'] = BASE_URL + link
            request = scrapy.Request(wdata['link'],
                                     callback=self.get_mini_bio, dont_filter=True)
            request.meta['item'] = MissleItem(**wdata)
            yield request
Here is the second part of the code:
    def get_mini_bio(self, response):
        BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
        item = response.meta['item']
        item['image_urls'] = []
        img_src = response.xpath('//table[contains(@class, "infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        paras = response.xpath('//*[@id="mw-content-text"]/p[text() or normalize-space(.)=""]').extract()
        for p in paras:
            if p == '<p></p>':
                break
            mini_bio += p
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        mini_bio = mini_bio.replace('href="#', item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item
I tried refactoring but am still getting:
ValueError: Missing scheme in request url: #mw-head
Any help would be immensely appreciated.
Looks like you were on the right track with the commented out [0].
xpath().extract() #returns a list of strings
You need to select the string with [0]
row.xpath('a/@href').extract()
That expression evaluates to a list, NOT a string. When you pass the URL to the Request object, Scrapy expects a string, not a list.
To fix this, you have a few options:
You can use LinkExtractors which will allow you to search a page for links and automatically create scrapy request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
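A minimal sketch of that approach (assuming a reasonably recent Scrapy; the spider below is illustrative, not the asker's code):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class WikiCrawlSpider(CrawlSpider):
    name = 'wiki_crawl'                              # hypothetical name
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']  # same elided start URL as the question

    # The extractor resolves relative hrefs against the page URL, so a bare
    # "#mw-head" never reaches Request as a scheme-less string.
    rules = (
        Rule(LinkExtractor(allow=r'/wiki/'), callback='parse_page', follow=True),
    )

    def parse_page(self, response):
        yield {'url': response.url}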
OR
You could run a for loop and go through each of the links:
from scrapy.http import Request

for link in response.xpath('//a/@href'):
    link = link.extract()
    if link.strip() != '':
        yield Request(link, callback=self.parse)
    else:
        yield None
You can add whatever string filters you want to that code.
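In this particular case a useful filter is to resolve each href against the response URL and skip fragment-only links, since a raw "#mw-head" href is exactly what produces the "Missing scheme" error (a sketch, not part of the original answer):

from scrapy.http import Request

for href in response.xpath('//a/@href').extract():
    href = href.strip()
    # Skip empty and fragment-only hrefs such as "#mw-head".
    if not href or href.startswith('#'):
        continue
    # response.urljoin() turns relative hrefs into absolute URLs with a scheme.
    yield Request(response.urljoin(href), callback=self.parse)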
OR
If you just want the first link, you can use .extract_first() instead of .extract()
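For example (illustrative selector):

first_href = row.xpath('a/@href').extract_first()   # a single string, or None if nothing matched
all_hrefs = row.xpath('a/@href').extract()          # always a list of strings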

Reducing data usage of Scrapy Spider/How to be a better internet citizen?

I recently resurrected an old scrapy spider (0.18.4) that crawls craigslist. The spider grabs links from the first page of apartment listings every 15 minutes and scrapes the links of new postings (see code below).
This spider was downloading >1GB of data every 15 minutes! I was able to reduce it by half, but this is still a lot of load on the craigslist site. Please note, this is from the spider alone, as I tested without pipelines enabled. I have also tried limiting through MAX_DEPTH and DOWNLOAD_MAXSIZE in settings.py, spider class settings, and individual follow-on request meta settings. I have spent several hours in the docs, messed around with Rules, updated to the latest version of scrapy, etc.; all to no avail. Granted, I wrote this script several years ago when I was new to Python, but perhaps my folly can be the community's gain...
Given my code below, what can I, or anyone else using scrapy, do to reduce the amount of data I'm downloading, when I only care about a couple of kB of text? Which calls are data-greedy?
Spider:
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider
#from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from craigslist_2br.items import CraigslistItem
from craigslist_2br import craigslistTools
from scrapy.http import Request
from urlparse import urljoin
import types
import exceptions
import datetime
import ipdb as pdb
#from ghost import Ghost


class Suppressor:
    def __init__(self, exception_type, localList):
        self._exception_type = exception_type
        self.safe_dict = dict([(k, locals().get(k, None)) for k in localList])

    def __call__(self, expression):
        try:
            exec(expression, {"__builtins__": None}, self.safe_dict)
        except self._exception_type as e:
            print 'Suppressor: suppressed exception %s with content \'%s\'' % (type(self._exception_type), e)


class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/search/apa/"]
    #g=Ghost()

    def parse(self, response):
        hxsM = HtmlXPathSelector(response)
        ##titles = hxs.select('//a[@class="result-title hdrlnk"]')
        postings = hxsM.select('//p[@class="result-info"]')
        resultRows = hxsM.select('//li[@class="result-row"]')
        try:
            IDf = open("/home/dapper/python/scrapy/scrapy_craig2br/craigslist_2br/craigslist_2br/postingIDs.txt", "r")
            IDvec = [int(sl.strip("\n")) for sl in IDf.readlines()]
        except:
            IDvec = []
        finally:
            try:
                IDf.close()
            except:
                pass
        for posting, resultRow in zip(postings, resultRows):
            item = CraigslistItem()
            try:
                item["ID"] = posting.select("a/@data-id").extract()[0]
                if (int(item["ID"]) == int(resultRow.select("@data-pid").extract()[0])):
                    try:
                        item["repostID"] = resultRow.select("@data-repost-of").extract()[0]
                    except:
                        item["repostID"] = ''
                else:
                    item["repostID"] = ''
            except:
                item["ID"] = ''
                item["repostID"] = ''
            try:
                item["price"] = posting.select("span[@class='result-meta']/span[@class='result-price']/text()").extract()[0]
            except:
                item["price"] = 0
            try:
                item["neighborhood"] = posting.select("span[@class='result-meta']/span[@class='result-hood']/text()").extract()[0]
            except:
                item["neighborhood"] = ''
            if ((not any(item["ID"]) or (int(item["ID"]) not in IDvec)) and
                    (craigslistTools.FixPriceStr(item['price']) < 3000.0) and
                    (not ((any(item["repostID"]) and (int(item["repostID"]) in IDvec)) and (any(item['neighborhood']) and craigslistTools.outDaHoods(item['neighborhood']))))):
                #s = Suppressor(exception_type=exceptions.IndexError, localList=[item,titles,postings])  # TODO: put your exception type here
                item["title"] = posting.select("a/text()").extract()[0]
                item["link"] = posting.select("a/@href").extract()[0]
                item["dateAdded"] = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")
                try:
                    item["size"] = posting.select("span[@class='result-meta']/span[@class='housing']/text()").extract()[0]
                except:
                    item["size"] = ''
                if item['link']:
                    if 'http://' not in item['link']:
                        if isinstance(item['link'], types.StringTypes):
                            item['link'] = urljoin(response.url, item['link'])
                        else:
                            item['link'] = urljoin(response.url, item['link'][0])
                    yield Request(item['link'],
                                  meta={'item': item, 'download_maxsize': 8**8, 'depth_limit': 1},
                                  callback=self.anchor_page)

    def anchor_page(self, response):
        hxs = HtmlXPathSelector(response)
        old_item = response.request.meta['item']  # Receiving parse Method item that was in Request meta
        # parse some more values
        # place them in old_item
        # e.g.
        old_item['postingbody'] = hxs.select("//section [@id='postingbody']").extract()
        try:
            latit = float(hxs.select("//div [@id='map']//@data-latitude").extract()[0])
            longi = float(hxs.select("//div [@id='map']//@data-longitude").extract()[0])
            #print '%f,%f'%(latit,longi)
        except:
            latit = 0
            longi = 0
        try:
            old_item["address"] = hxs.select(".//div [@class='mapaddress']/text()").extract()[0]
        except:
            old_item["address"] = []
        try:
            if any(latit) & (not any([char.isdigit() for char in old_item["address"][0]])):
                try:
                    old_item["address"] = craigslistTools.coord2addr(latit, longi)
                except:
                    pass
            elif (not any(old_item["address"])):
                try:
                    zips = [94611, 94610, 94609, 94606, 94618, 94705]
                    for z in zips:
                        for line in old_item['postingbody'].split("\n"):
                            if str(z) in line:
                                old_item["address"] = line
                except:
                    pass
        except:
            pass
        if (latit == 0) & (any(old_item["address"])):
            try:
                cities = ['Oakland', 'oakland', 'Piedmont', 'piedmont', 'Berkeley', 'berkeley', 'montclair', 'Montclair']
                if not any([c in old_item["address"] for c in cities]):
                    old_item["address"] += ', Oakland'
                geoRes = Geocoder.geocode(old_item["address"])
                latit = geoRes[0].latitude
                longi = geoRes[0].longitude
            except:
                pass
        old_item["latit"] = latit
        old_item["longi"] = longi
        try:
            (bartDist, bartCoord) = craigslistTools.bartDist((latit, longi))
        except:
            bartDist = 0
        try:
            if ((bartDist != 0) and (bartDist < 2)):
                bartWalkingTime = craigslistTools.bartWalking((latit, longi), bartCoord)
            else:
                bartWalkingTime = 0
        except:
            bartWalkingTime = 0
        old_item["bartDist"] = bartDist
        old_item["BartWalkingTime"] = bartWalkingTime
        try:
            if ((bartDist < 1) and (bartDist != 0)):
                old_item['LLNLDrivingTime'] = craigslistTools.LLNLdriving((latit, longi))
            else:
                old_item['LLNLDrivingTime'] = 0
        except:
            old_item['LLNLDrivingTime'] = 0
        try:
            old_item["Br"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[0].strip('BR'))
            old_item["baths"] = int(hxs.select("//p [@class='attrgroup']/span/b/text()").extract()[1].strip('Ba'))
        except:
            try:
                old_item["Br"] = int(re.findall(r"(\d+)br", old_item['size'])[0])
                old_item["baths"] = 1
            except:
                old_item["Br"] = -1
                old_item["baths"] = -1
        try:
            old_item['amenities'] = hxs.select("//p [@class='attrgroup']/span/text()").extract()
        except:
            old_item['amenities'] = []
        yield old_item
settings.py:
# Scrapy settings for craigslist_2br project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
BOT_NAME = 'craigslist_2br'
SPIDER_MODULES = ['craigslist_2br.spiders']
NEWSPIDER_MODULE = 'craigslist_2br.spiders'
ITEM_PIPELINES = {'craigslist_2br.pipelines.Craigslist2BrPipeline':0}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'craigslist_2br (+http://www.craigslist.org)'
DOWNLOAD_DELAY = 5
RANDOMIZE_DOWNLOAD_DELAY = False
CONCURRENT_REQUESTS_PER_DOMAIN = 1 # Default: 8
#SCHEDULER = 'scrapy.core.scheduler.Scheduler'
#HTTPCACHE_ENABLED = True
DEPTH_LIMIT = 1
Good news! The problem I experienced will likely not affect you (just what you wanted to read when searching for answers ;)). Using the deprecated HtmlXPathSelector from v0.18 can cause major issues (surprise, surprise), so don't do it.
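Beyond that, the settings that most directly bound how much each crawl downloads are the built-in ones below (a sketch with illustrative values; DOWNLOAD_MAXSIZE in particular only exists in newer Scrapy releases, so check what your version supports):

# settings.py additions
HTTPCACHE_ENABLED = True             # serve unchanged pages from a local cache
HTTPCACHE_EXPIRATION_SECS = 600      # refetch roughly every 10 minutes
DOWNLOAD_MAXSIZE = 2 * 1024 * 1024   # drop responses larger than ~2 MB
DEPTH_LIMIT = 1                      # keep follow-on requests shallow (already set above)
AUTOTHROTTLE_ENABLED = True          # back off automatically based on server latency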

Scrapy crawler does not scrape or print results in CSV

It seems like this Scrapy spider locates the links it is supposed to follow in order to collect additional information, but it either doesn't go to the next page or is unable to collect the information on the other page. I checked the XPath expressions for the links, and they all appear to be correct.
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python
import types
import time
from datetime import date, datetime, timedelta
import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector
from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString


class ResumeIndeedSpider(CrawlSpider):
    name = "indeed_resume"
    allowed_domains = ["indeed.com"]
    start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
                  'http://www.indeed.com/resumes/mechanical-engineering',
                  'http://www.indeed.com/resumes/piping-engineer',
                  'http://www.indeed.com/resumes/design-engineer',
                  'http://www.indeed.com/resumes/project-engineer']

    #def __init__(self, filename=None):
    #    self.unis = list()

    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//a[contains(@class,"app_link")]')), callback="parse_item", follow=True),)

    def parse_item(self, response):
        hxs = Selector(response)
        digest = hxs.xpath('//ol[@class="resultsList"]')
        records = ResumeDataItem()
        url_prefix = 'http://www.indeed.com'
        resume_links = digest.xpath('//li[@class="sre"]//div[@class="sre-entry"]')
        names = digest.xpath('//a[@target="_blank"]/text()').extract()
        links = digest.xpath('//a[@target="_blank"]/@href').extract()
        for name, link in zip(names, links):
            if name not in 'Feedback':
                records['name'] = name
                records['link'] = url_prefix + link
                yield Request(records['link'], meta={'item': records}, callback=self.parse_node)

    def parse_node(self, response):
        hxs = Selector(response)
        records = ResumeDataItem()
        # name = hxs.xpath('/text()').extract()
        name = hxs.xpath('//h1[@id="resume-contact"]/text()').extract()
        headline = hxs.xpath('//h2[@id="headline"]/text()').extract()
        # locale = hxs.xpath('//div[@class="addr" and @itemprop="address"]//p//text()').extract()
        rlocale = hxs.xpath('//p[@id="headline_location" and @class="locality"]//text()').extract()
        summary = hxs.xpath('//p[@id="res_summary" and @class="summary"]/text()').extract()
        skills = list()
        skill = hxs.xpath('//div[@id="skills-items" and @class="items-container"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        skill = hxs.xpath('//div[@id="additionalinfo-section" and @class="last"]//div[@class="data_display"]//p//text()').extract()
        if len(skill) != 0:
            skills.append(''.join(skill).encode('utf-8'))
        resume_links = list()
        links = hxs.xpath('//div[@id="link-items" and @class="items-container"]//p//text()').extract()
        for link in links:
            resume_links.append(''.join(link).encode('utf-8'))
        workHistory = ItemList()
        experience = hxs.xpath('//div[@id="work-experience-items"]/div')
        for elem in experience:
            item = elem.xpath('div')
            for entry in item:
                workEntry = WorkItem()
                title = entry.xpath('p[@class="work_title title"]//text()').extract()
                workEntry['title'] = ''.join(title).encode('utf-8')
                company = entry.xpath('div[@class="work_company"]/span/text()').extract()
                workEntry['company'] = ''.join(company).encode('utf-8')
                location = entry.xpath('div[@class="work_company"]/div[@class="inline-block"]/span/text()').extract()
                workEntry['work_location'] = ''.join(company).encode('utf-8')
                dates = entry.xpath('p[@class="work_dates"]//text()').extract()
                dates_str = ''.join(dates).encode('utf-8').split(' to ')
                if len(dates) > 0:
                    if dates_str[0]:
                        workEntry['start_date'] = dates_str[0]
                    if dates_str[1]:
                        workEntry['end_date'] = dates_str[1]
                else:
                    workEntry['start_date'] = 'NULL'
                    workEntry['end_date'] = 'NULL'
                description = entry.xpath('p[@class="work_description"]//text()').extract()
                workEntry['description'] = ''.join(description).encode('utf-8')
                workHistory.container.append(workEntry)
        eduHistory = ItemList()
        education = hxs.xpath('//div[@id="education-items" and @class="items-container"]/div')
        for elem in education:
            item = elem.xpath('div')
            for entry in item:
                eduEntry = SchoolItem()
                degree = entry.xpath('p[@class="edu_title"]/text()').extract()
                degree = ''.join(degree).encode('utf-8')
                eduEntry['degree'] = degree
                school = entry.xpath('div[@class="edu_school"]/span//text()').extract()
                school = ''.join(school).encode('utf-8')
                eduEntry['school'] = school
                locale = entry.xpath('span[@itemprop="addressLocality"]/text()').extract()
                locale = ''.join(locale).encode('utf-8')
                eduEntry['locale'] = locale
                grad_date = entry.xpath('p[@class="edu_dates"]/text()').extract()
                dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
                if len(grad_date) > 0:
                    if len(dates_str) == 2:
                        if dates_str[0]:
                            eduEntry['admit_date'] = dates_str[0]
                        try:
                            if dates_str[1]:
                                eduEntry['grad_date'] = dates_str[1]
                        except:
                            pass
                    elif len(dates_str) == 1:
                        if dates_str[0]:
                            eduEntry['grad_date'] = dates_str[0]
                            eduEntry['admit_date'] = 'NULL'
                else:
                    eduEntry['admit_date'] = 'NULL'
                    eduEntry['grad_date'] = 'NULL'
                eduHistory.container.append(eduEntry)
        records['url'] = response.url
        records['name'] = ''.join(name).encode('utf-8')
        records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
        records['locale'] = ''.join(rlocale).encode('utf-8')
        records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
        records['skills'] = msgpack.packb(skills)
        records['links'] = resume_links
        #records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
        records['experience'] = workHistory
        records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
        #records['experience'] = workHistory
        #records['education'] = eduHistory
        return records
Obviously this part of the code
for name, link in zip(names, links):
    if name not in 'Feedback':
        records['name'] = name
        records['link'] = url_prefix + link
        yield Request(records['link'], meta={'item': records}, callback=self.parse_node)
doesn't emit any links. Perhaps you meant if 'Feedback' not in name.
Also note that the XPath here, digest.xpath('//a[@target="_blank"]/text()'), is applied to the DOM overall, not only to the part previously extracted for digest. If you want to apply the XPath to the digest selector, you should use a leading dot in the expression, like this: digest.xpath('.//a[@target="_blank"]/text()')
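Putting both fixes together, that part of parse_item might look like this (a sketch of the corrections above, not tested against the live site):

names = digest.xpath('.//a[@target="_blank"]/text()').extract()   # note the leading dot
links = digest.xpath('.//a[@target="_blank"]/@href').extract()
for name, link in zip(names, links):
    if 'Feedback' not in name:   # skip the "Feedback" anchor, keep everything else
        records['name'] = name
        records['link'] = url_prefix + link
        yield Request(records['link'], meta={'item': records}, callback=self.parse_node)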

Scrapy Callback Function not scraping the Entire Data?

First of all, this is my code:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
#from scrapy import log, signals
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import re

#query=raw_input("Enter a product to search for= ")
query = 'apple'
query1 = query.replace(" ", "+")


class DmozItem(scrapy.Item):
    productname = scrapy.Field()
    product_link = scrapy.Field()
    current_price = scrapy.Field()
    mrp = scrapy.Field()
    offer = scrapy.Field()
    imageurl = scrapy.Field()
    outofstock_status = scrapy.Field()
    add = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["http://www.bestmercato.com"]

    def start_requests(self):
        task_urls = [
        ]
        i = 1
        for i in range(1, 2):
            temp = ("https://www.bestmercato.com/index.php?route=product/search&search=" + query1 + "&page=" + str(i))
            task_urls.append(temp)
            i = i + 1
        start_urls = (task_urls)
        # p=len(task_urls)
        return [Request(url=start_url) for start_url in start_urls]

    def parse(self, response):
        items = []
        for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
            item = DmozItem()
            item['productname'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/text()').extract())[3:-2]
            item['product_link'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/@href').extract())[3:-2]
            point1 = sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]').extract()
            point = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/@class').extract())[3:-2]
            checker = "options" in point
            item['current_price'] = ""
            if checker:
                i = 1
                p = 1
                while i == 1:
                    t = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/div/select/option[' + str(p) + ']/text()').extract())[3:-2]
                    #print t
                    if 'Rs' not in t:
                        i = 2
                    elif 'Rs' in t:
                        i = 1
                    t = " ".join(t)
                    s = t.translate(None, '\ t')[:-2]
                    item['current_price'] = item['current_price'] + ' ; ' + s
                    p = p + 1
                item['current_price'] = item['current_price'][3:-3]
            else:
                item['current_price'] = 'Rs. ' + str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[not (@class="name") or not(@class="description") or not(@class="qty") or not(@class="box_btn_icon")]/text()').extract())[46:-169]
                re.findall(r"[-+]?\d*\.\d+|\d+", item["current_price"])
            try:
                test1 = str(sel.xpath('div/div[2]/div[3]/span[1]/text()').extract())[3:-2]
                _digits = re.compile('\d')
                if bool(_digits.search(test1)):
                    print 'hi'
                    test1 = test1[:2] + '. ' + test1[3:]
                    item['mrp'] = test1
                    #item['mrp'][2:2]='.'
                    test2 = str(sel.xpath('div/div[2]/div[3]/span[2]/text()').extract())[3:-2]
                    test2 = test2[:2] + '. ' + test2[3:]
                    item['current_price'] = test2
                else:
                    item['mrp'] = item['current_price']
            except:
                item['mrp'] = item['current_price']
            item['offer'] = 'No additional offer available'
            item['imageurl'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="image"]/a[not (@class="sft_quickshop_icon")]/img[@class="img-responsive"]/@src').extract())[3:-2]
            item['outofstock_status'] = str('In Stock')
            request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
            request.meta['item'] = item
            # print item
            items.append(item)
            return request
        print (items)

    def parse2(self, response):
        item = response.meta['item']
        item['add'] = response.url
        return item


spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS", 100)
#)
#settings.set( "DEPTH_PRIORITY" , 1)
#settings.set("SCHEDULER_DISK_QUEUE" , "scrapy.squeues.PickleFifoDiskQueue")
#settings.set( "SCHEDULER_MEMORY_QUEUE" , "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
Now, these are the issues that I am facing.
1. There are numerous divs that can be found with this xpath - '//html/body/div/div/div[4]/div/div/div[5]/div'. However, the above code scrapes the contents of only the first div, i.e. the one with the xpath 'html/body/div/div/div[4]/div/div/div[5]/div[1]', and not all of them.
The moment I comment out these three lines, the scraper scrapes everything, but then obviously I am not able to add the 'add' field to the item:
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
return request
So, I want to scrape all the divs, and also populate the 'add' field in my item class (notice the class DmozItem). How do I do that? Please give corrected code for my SPECIFIC case; it would be best that way!
2. Secondly, as I said, when I comment out the three lines mentioned above, the program scrapes everything in a time close to 5 seconds (around 4.9 seconds).
But as soon as I uncomment those 3 lines (again, the ones mentioned above), the program's run time increases drastically, and it runs in a time close to 9 seconds (around 8.8-8.9 seconds). Why does this happen? Is it because of dont_filter=True? Please suggest ways to overcome this, as the run time can prove to be a very big problem for me. Also, can I decrease the initial time of 5 seconds (around 4.9) somehow?
Use html/body/div/div/div[4]/div/div/div[5]//div to get all divs after div[5].
EDIT:
This is the correct XPath: //html/body/div/div/div[4]/div/div/div[5]/div, which gives all the divs after div[5]. The one mentioned previously gave multiple errors!
If you do a return statement inside the loop, you end the whole method execution. So if you enable those three lines, you end the execution of your method (and the for loop) after the first element.
This means you should yield your request instead of returning it.
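A sketch of what that change looks like inside the loop (only the tail of the parse loop is shown; the field assignments stay as they are):

for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
    item = DmozItem()
    # ... populate the item fields exactly as before ...
    request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
    request.meta['item'] = item
    # yield lets the loop continue; return would end parse after the first div
    yield request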

Scrapy multiple search terms

I am very new to Python and I am in the process of learning how to scrape web pages (1 day in). The task I want to achieve is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started by using Scrapy, and I have managed to get the workflow to work for one company (not elegant, but at least I am trying), but I cannot figure out how to load the list of companies and loop through it to carry out multiple searches.
So, my main question is: where in the spider class should I define the query array of companies to loop through? I do not know the exact URLs, since each company has a unique ID and belongs to a specific market, so I cannot input them as start_urls.
Is Scrapy the right tool for this type of task, or should I have used mechanize?
Here is my current code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["proff.se"]
    start_urls = ["http://www.proff.se"]

    # Search on the website; currently I have just put in a static search term here,
    # but I would like to loop over a list of companies.
    def parse(self, response):
        return FormRequest.from_response(response, formdata={'q': 'rebtel'}, callback=self.search_result)

    # I fetch the url from the search result and convert it to the correct Financial-url
    # where the information is located.
    def search_result(self, response):
        sel = HtmlXPathSelector(response)
        link = sel.xpath('//ul[@class="company-list two-columns"]/li/a/@href').extract()
        finance_url = str(link[0]).replace("/foretag", "http://www.proff.se/nyckeltal")
        return Request(finance_url, callback=self.parse_finance)

    # I scraped the information of this particular company; this is hardcoded and will not
    # work for other responses. I had some issues with the encoding characters
    # initially since they were Swedish. I also tried to target the Json element directly by
    # revenue = sel.xpath('#//*[@id="accountTable1"]/tbody/tr[3]/@data-chart').extract()
    # but was not able to parse it (error - expected string or buffer - tried to convert it
    # to a string by str() with no luck; something off with the formatting, which is messing with the data types).
    def parse_finance(self, response):
        sel = HtmlXPathSelector(response)
        datachart = sel.xpath('//tr/@data-chart').extract()
        employees = json.loads(datachart[36])
        revenue = json.loads(datachart[0])
        items = []
        item = DmozItem()
        item['company'] = response.url.split("/")[-5]
        item['market'] = response.url.split("/")[-3]
        item['employees'] = employees
        item['revenue'] = revenue
        items.append(item)
        return item
The common approach is to do this with a command-line argument. Give the spider's __init__ method an argument:
class ProffSpider(BaseSpider):
    name = "proff"
    ...

    def __init__(self, query):
        self.query = query

    def parse(self, response):
        return FormRequest.from_response(response,
            formdata={'q': self.query},
            callback=self.search_result
        )

    ...
And then start your spiders (maybe with Scrapyd):
$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"
If you want to run a bunch of spiders at once by passing in the arguments from a file, you can create a new command to run multiple instances of a spider. This is just mixing the builtin crawl command with the example code for running multiple spiders with a single crawler:
your_project/settings.py
COMMANDS_MODULE = 'your_project_module.commands'
your_project/commands/__init__.py
# empty file
your_project/commands/crawl_many.py
import os
import csv

from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Run many instances of a spider'

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
        parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
        parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        if not opts.output:
            return
        if opts.output == '-':
            self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
        else:
            self.settings.set('FEED_URI', opts.output, priority='cmdline')
        feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
        valid_output_formats = feed_exporters.keys()
        if not opts.output_format:
            opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')
        if opts.output_format not in valid_output_formats:
            raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))
        self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')

    def run(self, args, opts):
        if args:
            raise UsageError()
        with open(opts.input_file, 'rb') as handle:
            for spider_options in csv.DictReader(handle):
                spider = spider_options.pop('spider')
                self.crawler_process.crawl(spider, **spider_options)
        self.crawler_process.start()
You can run it like so:
$ scrapy crawl_many -f crawl_options.csv -o output_file.jsonl
The format of the crawl options CSV is simple:
spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd
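For that CSV to work, the spider just needs to accept the extra columns as keyword arguments in __init__, along these lines (a sketch consistent with the ProffSpider example above; arg2 and arg3 are the placeholder column names from the CSV):

class ProffSpider(BaseSpider):
    name = "proff"

    def __init__(self, query=None, arg2=None, arg3=None, **kwargs):
        super(ProffSpider, self).__init__(**kwargs)
        self.query = query
        self.arg2 = arg2
        self.arg3 = arg3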
The first thing I'd do is to create a list of companies and find a way to get the URL of each one. After this, crawling is easy. I have written a crawler to extract disease information from Wikipedia from a list of diseases. See how it fits your use case.
import requests
from bs4 import BeautifulSoup
import sys
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from subprocess import Popen, check_call
from multiprocessing import Pool

#nltk.download()


def crawlwiki(keywords):
    print (keywords)
    columns = ['Category', 'Text']
    page = 1
    print ('Fetching for {}....'.format(keywords))
    url = 'https://en.wikipedia.org/wiki/'
    for i in range(len(keywords)):
        url = url + keywords[i]
        url = url + '%20'
    url = url[0:(len(url) - 3)]
    output_obj = {}
    #curr_page = url+str(page)
    while True:
        try:
            page_source = requests.get(url)
        except:
            # What you should do if the internet connection fails
            break
        plain_text = page_source.text
        bs_obj = BeautifulSoup(plain_text, "lxml")
        '''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
        base_url = 'http://www.webmd.com'
        for div in toc_links:
            links = div.findAll('a')
            for a in links:
                output_obj[a.text] = base_url + a.get('href')
                print (base_url + a.get('href'))
        data = bs_obj.findAll('div', {'class':'search-text-container'})
        for div in data:
            links = div.findAll('a')
            for a in links:
                output_obj[a.text] = a.get('href')
                print (a.get('href'))'''
        """
        Mapping:
        1 : Signs and symptoms
        2 : Diagnosis
        3 : Prognosis
        4 : Treatment
        """
        symptom_text = re.findall('<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(symptom_text)
        symptoms_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        symptom_data = symptoms_object.findAll('p')
        symptom_paragraphs = ""
        for p in symptom_data:
            symptom_paragraphs += p.text
        symptom_paragraphs = re.sub(r"/?\[\d+]", '', symptom_paragraphs, re.DOTALL)
        df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)
        diagnosis_text = re.findall('<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(diagnosis_text)
        diagnosis_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        diagnosis_data = diagnosis_object.findAll('p')
        diagnosis_paragraphs = ""
        for p in diagnosis_data:
            diagnosis_paragraphs += p.text
        diagnosis_paragraphs = re.sub(r"/?\[\d+]", '', diagnosis_paragraphs, re.DOTALL)
        df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)
        prognosis_text = re.findall('<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(prognosis_text)
        prognosis_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        prognosis_data = prognosis_object.findAll('p')
        prognosis_paragraphs = ""
        for p in prognosis_data:
            prognosis_paragraphs += p.text
        prognosis_paragraphs = re.sub(r"/?\[\d+]", '', prognosis_paragraphs, re.DOTALL)
        df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)
        treatment_text = re.findall('<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
        str1 = ''.join(treatment_text)
        treatment_object = BeautifulSoup(str1, "lxml")
        #paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
        treatment_data = treatment_object.findAll('p')
        treatment_paragraphs = ""
        for p in treatment_data:
            treatment_paragraphs += p.text
        treatment_paragraphs = re.sub(r"/?\[\d+]", '', treatment_paragraphs, re.DOTALL)
        df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)
        df = pd.DataFrame(columns=columns)
        df = df.append(df_1.append(df_2.append(df_3.append(df_4))))
        return df
    print('Fetch completed....')


def main():
    disease_df = pd.read_csv("disease.txt", sep="\n", header=None)
    columns = ['Category', 'Text']
    df_data = pd.DataFrame(columns=columns)
    size = disease_df.size
    print("Initializing....")
    p = Pool(5)
    df_data = p.map(crawlwiki, disease_df.values.tolist())
    """for index, row in disease_df.iterrows():
        print('Iteration {0} out of {1}.....'.format(index+1, size))
        df = crawlwiki(row, columns)
        df_data = df_data.append(df)"""
    df_data.to_csv("TagDataset.csv", index=False)


if __name__ == '__main__':
    main()
