Scrapy crawler does not scrape or print results in CSV

Scrapy crawler does not scrape or print results in CSV - python

Seems like this scrapy spider locates the links that it is supposed to go to in order to collect additional information, but it either doesn't go to the next page or it is unable to collect the information on the other page. I checked the xpath links, they all appear to be correct.
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python
import types
import time
from datetime import date, datetime, timedelta
import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector
from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString
class ResumeIndeedSpider(CrawlSpider):
name = "indeed_resume"
allowed_domains = ["indeed.com"]
start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
'http://www.indeed.com/resumes/mechanical-engineering',
'http://www.indeed.com/resumes/piping-engineer',
'http://www.indeed.com/resumes/design-engineer',
'http://www.indeed.com/resumes/project-engineer']
#def __init__(self, filename=None):
#self.unis = list()
rules = (Rule (SgmlLinkExtractor(restrict_xpaths = ('//a[contains(#class,"app_link")]')), callback = "parse_item", follow = True),)
def parse_item(self, response):
hxs = Selector(response)
digest = hxs.xpath('//ol[#class="resultsList"]')
records = ResumeDataItem()
url_prefix = 'http://www.indeed.com'
resume_links = digest.xpath('//li[#class="sre"]//div[#class="sre-entry"]')
names = digest.xpath('//a[#target="_blank"]/text()').extract()
links = digest.xpath('//a[#target="_blank"]/#href').extract()
for name, link in zip(names,links):
if name not in 'Feedback':
records['name'] = name
records['link'] = url_prefix+link
yield Request(records['link'], meta={'item': records}, callback= self.parse_node)
def parse_node(self, response):
hxs = Selector(response)
records = ResumeDataItem()
# name = hxs.xpath('/text()').extract()
name = hxs.xpath('//h1[#id="resume-contact"]/text()').extract()
headline = hxs.xpath('//h2[#id="headline"]/text()').extract()
# locale = hxs.xpath('//div[#class="addr" and #itemprop="address"]//p//text()').extract()
rlocale = hxs.xpath('//p[#id="headline_location" and #class="locality"]//text()').extract()
summary = hxs.xpath('//p[#id="res_summary" and #class="summary"]/text()').extract()
skills = list()
skill = hxs.xpath('//div[#id="skills-items" and #class="items-container"]//p//text()').extract()
if len(skill) != 0:
skills.append(''.join(skill).encode('utf-8'))
skill = hxs.xpath('//div[#id="additionalinfo-section" and #class="last"]//div[#class="data_display"]//p//text()').extract()
if len(skill) != 0:
skills.append(''.join(skill).encode('utf-8'))
resume_links = list()
links = hxs.xpath('//div[#id="link-items" and #class="items-container"]//p//text()').extract()
for link in links:
resume_links.append(''.join(link).encode('utf-8'))
workHistory = ItemList()
experience = hxs.xpath('//div[#id="work-experience-items"]/div')
for elem in experience:
item = elem.xpath('div')
for entry in item:
workEntry = WorkItem()
title = entry.xpath('p[#class="work_title title"]//text()').extract()
workEntry['title'] = ''.join(title).encode('utf-8')
company = entry.xpath('div[#class="work_company"]/span/text()').extract()
workEntry['company']= ''.join(company).encode('utf-8')
location = entry.xpath('div[#class="work_company"]/div[#class="inline-block"]/span/text()').extract()
workEntry['work_location'] = ''.join(company).encode('utf-8')
dates = entry.xpath('p[#class="work_dates"]//text()').extract()
dates_str = ''.join(dates).encode('utf-8').split(' to ')
if len(dates) > 0:
if dates_str[0]:
workEntry['start_date'] = dates_str[0]
if dates_str[1]:
workEntry['end_date'] = dates_str[1]
else:
workEntry['start_date'] = 'NULL'
workEntry['end_date'] = 'NULL'
description = entry.xpath('p[#class="work_description"]//text()').extract()
workEntry['description'] = ''.join(description).encode('utf-8')
workHistory.container.append(workEntry)
eduHistory = ItemList()
education = hxs.xpath('//div[#id="education-items" and #class="items-container"]/div')
for elem in education:
item = elem.xpath('div')
for entry in item:
eduEntry = SchoolItem()
degree = entry.xpath('p[#class="edu_title"]/text()').extract()
degree = ''.join(degree).encode('utf-8')
eduEntry['degree'] = degree
school = entry.xpath('div[#class="edu_school"]/span//text()').extract()
school = ''.join(school).encode('utf-8')
eduEntry['school'] = school
locale = entry.xpath('span[#itemprop="addressLocality"]/text()').extract()
locale = ''.join(locale).encode('utf-8')
eduEntry['locale'] = locale
grad_date = entry.xpath('p[#class="edu_dates"]/text()').extract()
dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
if len(grad_date) > 0:
if len(dates_str) == 2:
if dates_str[0]:
eduEntry['admit_date'] = dates_str[0]
try:
if dates_str[1]:
eduEntry['grad_date'] = dates_str[1]
except:
pass
elif len(dates_str) == 1:
if dates_str[0]:
eduEntry['grad_date'] = dates_str[0]
eduEntry['admit_date'] = 'NULL'
else:
eduEntry['admit_date'] = 'NULL'
eduEntry['grad_date'] = 'NULL'
eduHistory.container.append(eduEntry)
records['url'] = response.url
records['name'] = ''.join(name).encode('utf-8')
records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
records['locale'] = ''.join(rlocale).encode('utf-8')
records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
records['skills'] = msgpack.packb(skills)
records['links'] = resume_links
#records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
records['experience'] = workHistory
records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
#records['experience'] = workHistory
#records['education'] = eduHistory
return records`

Obviously this part of code
for name, link in zip(names,links):
if name not in 'Feedback':
records['name'] = name
records['link'] = url_prefix+link
yield Request(records['link'], meta={'item': records}, callback= self.parse_node)
doesn't emit any link. Perhaps you meant if 'Feedback' not in name
Also note that XPath here digest.xpath('//a[#target="_blank"]/text()') applied to DOM overall, not only part previously extracted for digest. If you'd like to apply XPath to digest selector you should rather use leading dot in xpath like this digest.xpath('.//a[#target="_blank"]/text()')

Related

recursive web crawling in python

this is my code :
import requests
from bs4 import BeautifulSoup
import re
class WebCrawler():
def check(self, links):
global imgCount
for item in links:
targetURL = item['href']
if(targetURL.startswith('/')):
targetURL = target + targetURL # add http:// and hostname to url
target_html = requests.get(targetURL)
parsed_html = BeautifulSoup(target_html.text, 'html.parser')
if parsed_html.title.text not in pages:
pages.append(parsed_html.title.text)
print "[+] Collecting images page : " + parsed_html.title.text
images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
for img_url in images:
imgCount=imgCount + 1
# print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
pages = []
imgCount = 0
target = raw_input("Please enter base url: ")
data = BeautifulSoup(requests.get(target).text, 'html.parser')
link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)
print "===================== Total Collected Images =====================\n"
print imgCount
I want to that continue in others pages. mean its continue counting until there isn't any link.
When I calling the check function, thats not worked !
import requests
from bs4 import BeautifulSoup
import re
class WebCrawler():
def check(self, links):
global imgCount
for item in links:
targetURL = item['href']
if(targetURL.startswith('/')):
targetURL = target + targetURL # add http:// and hostname to url
target_html = requests.get(targetURL)
parsed_html = BeautifulSoup(target_html.text, 'html.parser')
if parsed_html.title.text not in pages:
pages.append(parsed_html.title.text)
print "[+] Collecting images page : " + parsed_html.title.text
images = parsed_html.findAll('img', {'src': re.compile(r'(jpe?g)|(png)|(svg)$')})
for img_url in images:
imgCount=imgCount + 1
# print img_url['src'] + ':::::::::' + img_url.get('alt', "") + "\n"
lnks = parsed_html.find_all('a')
self.check(lnks)
pages = []
imgCount = 0
target = raw_input("Please enter base url: ")
data = BeautifulSoup(requests.get(target).text, 'html.parser')
link = data.find_all('a')
crawler = WebCrawler()
crawler.check(link)
print "===================== Total Collected Images =====================\n"
print imgCount
I added these lines to it :
lnks = parsed_html.find_all('a')
self.check(lnks)
this time, the loop executed only one time !

Try something like this:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(CrawlSpider):
name = "craigs"
allowed_domains = ["sfbay.craigslist.org"]
start_urls = ["http://sfbay.craigslist.org/search/npo"]
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a[#class="button next"]',)), callback="parse_items", follow= True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.xpath('//span[#class="pl"]')
items = []
for titles in titles:
item = CraigslistSampleItem()
item["title"] = titles.xpath("a/text()").extract()
item["link"] = titles.xpath("a/#href").extract()
items.append(item)
return(items)

Stuck with Data Crawling on Scrapy

One of my friend was developing a scrapy script to scrap data from a page.
After sometime, I needed to add another field into. And I added the field successfully. But the problem is the field is not getting the data of the links inside the td. The field name is "Last Batsman"
Data URL:
http://digicricket.marssil.com/match/MatchData.aspx?op=1&match=1385
XPath of the Data:
//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[6]/tr/td
import scrapy
from bs4 import BeautifulSoup
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector
from digicricket.items import ODIorTestItem
class DigicricketMarsilOp1Spider(scrapy.Spider):
name = "digicricket.marssil.op1"
allowed_domains = ["digicricket.marssil.com"]
def __init__(self, match_id=None):
if match_id:
match_id_list = match_id.split(',')
for i in match_id_list:
if not i.isdigit():
raise CloseSpider('Match ID = {0} is not a number'.format(i))
else:
self.start_urls = ['http://digicricket.marssil.com/match/MatchData.aspx?op=1&match={0}'.format(i)
for i in match_id_list]
else:
raise CloseSpider('You forgot input Match ID/IDs')
def parse(self, response):
item = ODIorTestItem()
item['Batsman_op1'] = []
item['Bowler_op1'] = []
item['other_op1'] = []
sel = Selector(response)
tables = sel.xpath('//div[#id="ctl00_ContentPlaceHolder1_divData"]/table').extract()
row_for_other = dict()
for i in xrange(len(tables)):
html_text = BeautifulSoup(tables[i])
if i == 1:
sl = 0
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if td:
sl += 1
row = dict()
row['sl'] = sl
row['match_id'] = response.url[response.url.rfind('=')+1:]
row["Batsman"] = td[0].get_text()
row["R"] = td[1].get_text()
row["B"] = td[2].get_text()
row["4s"] = td[3].get_text()
row["6s"] = td[4].get_text()
row["SR"] = td[5].get_text()
item['Batsman_op1'].append(row)
elif i == 2:
sl = 0
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if td:
sl += 1
row = dict()
row['sl'] = sl
row['match_id'] = response.url[response.url.rfind('=')+1:]
row["Bowler"] = td[0].get_text()
row["O"] = td[1].get_text()
row["M"] = td[2].get_text()
row["R"] = td[3].get_text()
row["W"] = td[4].get_text()
row["Econ"] = td[5].get_text()
item['Bowler_op1'].append(row)
else:
for tr in html_text.find_all('tr'):
td = tr.find_all('td')
if i == 0:
try:
row_for_other["InningsMatchDetails"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[1]').extract()[0]
except:
row_for_other["InningsMatchDetails"] = None
try:
row_for_other["CurrentScore"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/span/text()').extract()[0]
except:
row_for_other["CurrentScore"] = None
try:
row_for_other["OversRunRate"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[2]').extract()[0]
except:
row_for_other["OversRunRate"] = None
try:
row_for_other["Extras"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/table[1]/'
'tr/td/b/text()[3]').extract()[0]
except:
row_for_other["Extras"] = None
try:
row_for_other["MatchResult"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[1]/tr/td/b/text()[4]').extract()[0]
except:
row_for_other["MatchResult"] = None
try:
row_for_other["RecentOvers"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[4]/tr/td[2]/text()').extract()[0]
except:
row_for_other["RecentOvers"] = None
try:
row_for_other["LastBatsman"] = sel.xpath('//*[#id="ctl00_ContentPlaceHolder1_divData"]/'
'table[6]/tr/td/text()').extract()[0]
except:
row_for_other["LastBatsman"] = None
row_for_other['match_id'] = response.url[response.url.rfind('=')+1:]
item['other_op1'].append(row_for_other)
return item

Your XPath seems to miss some tags. On the web page there are two div levels before the second table. Replacing / with // takes care of these. (Because my browser added some <tbody> tags there is also a double slash in front of the tr.
.//*[#id="ctl00_ContentPlaceHolder1_divData"]//table[6]//tr/td/a[1]/text()

Scrapy (Python): Iterating over 'next' page without multiple functions

I am using Scrapy to grab stock data from Yahoo! Finance.
Sometimes, I need to loop over several pages, 19 in this example , in order to get all of the stock data.
Previously (when I knew there would only be two pages), I would use one function for each page, like so:
def stocks_page_1(self, response):
returns_page1 = []
#Grabs data here...
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks_page_2, meta={'returns_page1': returns_page1})
def stocks_page_2(self, response):
# Grab data again...
Now, instead of writing 19 or more functions, I was wondering if there was a way I could loop through an iteration using one function to grab all data from all pages available for a given stock.
Something like this:
for x in range(30): # 30 was randomly selected
current_page = response.url
# Grabs Data
# Check if there is a 'next' page:
if response.xpath('//td[#align="right"]/a[#rel="next"]').extract() != ' ':
u = x * 66
next_page = current_page + "&z=66&y={0}".format(u)
# Go to the next page somehow within the function???
Updated Code:
Works, but only returns one page of data.
class DmozSpider(CrawlSpider):
name = "dnot"
allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
start_urls = ['http://finance.yahoo.com/q?s=CAT']
rules = [
Rule(LinkExtractor(restrict_xpaths='//td[#align="right"]/a[#rel="next"]'),
callback='stocks1',
follow=True),
]
def stocks1(self, response):
returns = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
returns.append(values)
except ValueError:
continue
except ValueError:
continue
unformatted_returns = response.meta.get('returns_pages')
returns = [float(i) for i in returns]
global required_amount_of_returns, counter
if counter == 1 and "CAT" in response.url:
required_amount_of_returns = len(returns)
elif required_amount_of_returns == 0:
raise CloseSpider("'Error with initiating required amount of returns'")
counter += 1
print counter
# Iterator to calculate Rate of return
# ====================================
if data_intervals == "m":
k = 12
elif data_intervals == "w":
k = 4
else:
k = 30
sub_returns_amount = required_amount_of_returns - k
sub_returns = returns[:sub_returns_amount]
rate_of_return = []
if len(returns) == required_amount_of_returns or "CAT" in response.url:
for number in sub_returns:
numerator = number - returns[k]
rate = numerator/returns[k]
if rate == '':
rate = 0
rate_of_return.append(rate)
k += 1
item = Website()
items = []
item['url'] = response.url
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
item['avg_returns'] = numpy.average(rate_of_return)
item['var_returns'] = numpy.cov(rate_of_return)
item['sd_returns'] = numpy.std(rate_of_return)
item['returns'] = returns
item['rate_of_returns'] = rate_of_return
item['exchange'] = response.xpath('//span[#class="rtq_exch"]/text()').extract()
item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
items.append(item)
yield item

You see, a parse callback is just a function that takes the response and returns or yields either Items or Requests or both. There is no issue at all with reusing these callbacks, so you can just pass the same callback for every request.
Now, you could pass the current page info using the Request meta but instead, I'd leverage the CrawlSpider to crawl across every page. It's really easy, start generating the Spider with the command line:
scrapy genspider --template crawl finance finance.yahoo.com
Then write it like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
Scrapy 1.0 has deprecated the scrapy.contrib namespace for the modules above, but if you're stuck with 0.24, use scrapy.contrib.linkextractors and scrapy.contrib.spiders.
from yfinance.items import YfinanceItem
class FinanceSpider(CrawlSpider):
name = 'finance'
allowed_domains = ['finance.yahoo.com']
start_urls = ['http://finance.yahoo.com/q/hp?s=PWF.TO&a=04&b=19&c=2005&d=04&e=19&f=2010&g=d&z=66&y=132']
rules = (
Rule(LinkExtractor(restrict_css='[rel="next"]'),
callback='parse_items',
follow=True),
)
LinkExtractor will pick up the links in the response to follow, but it can be limited with XPath (or CSS) and regular expressions. See documentation for more.
Rules will follow the links and call the callback on every response. follow=True will keep extracting links on every new response, but it can be limited by depth. See documentation again.
def parse_items(self, response):
for line in response.css('.yfnc_datamodoutline1 table tr')[1:-1]:
yield YfinanceItem(date=line.css('td:first-child::text').extract()[0])
Just yield the Items, since Requests for the next pages will be handled by the CrawlSpider Rules.

Problems with passing global variables in a Python scrapy project

In a Scrapy project I am doing, I am having difficulties in sending a variable containing a list from one function to another. I need to do so, as I need to combine the values from one page along with another at the end of the script. The code is as follows:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
from six import string_types
from datetime import datetime
from decimal import Decimal
import itertools
import numpy
import urlparse
import scrapy
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["ca.finance.yahoo.com", "http://eoddata.com/"]
start_urls = [
"http://eoddata.com/stocklist/TSX.htm"
]
def parse(self,response):
companyList = response.xpath('//tr[#class="ro"]/td/a/text()').extract()
for company in companyList:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m'.format(company)
for link in go:
yield Request(go, self.stocks1)
def stocks1(self, response):
# global returns_page1
# EAFP = Easier to ask for forgiveness then permission
# Gathers ONLY adjusted closing stock price
global returns_page1
returns_page1 = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
datetime.strptime(cells[0], "%b %d, %Y")
values = cells[-1]
returns_page1.append(values)
except ValueError:
continue
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks2)
def stocks2(self, response):
item = Website()
global returns_page1
returns_page2 = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
datetime.strptime(cells[0], "%b %d, %Y")
values = cells[-1]
returns_page2.append(values)
except ValueError:
continue
returns_tot = returns_page1 + returns_page2
returns_dec = [Decimal(float(i)) for i in returns_tot]
returns = [float(n) for n in returns_dec]
items = []
item = Website()
item['url'] = response.url
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
item['avgreturns'] = numpy.mean(returns)
item['varreturns'] = numpy.var(returns)
item['sdreturns'] = numpy.std(returns)
item['returns'] = returns
items.append(item)
yield item
I am trying to combine returns_page1 from the def stocks1 function with returns_page2 that is gathered in the def stocks2 function. However my output is only giving me the values from the returns_page2 variable.
I know I can't put a return in the def stocks1 function because I have a yield in there. That's why I tried using global variables.
What am I doing wrong here?

Best way of passing values from one function to another is using meta in request,
in first function
yield Request(next_page, self.stocks2, meta={'returns_page1': returns_page1})
in second function
returns_page1 = response.meta.get('returns_page1')

Scrapy multiple search terms

I am very new to Python and I am in the process of learning on how scrape web pages (1 day in). The task I want to achieve is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started by using scrapy, and I have managed to get the workflow to work for one company (not elegant, but at least I am trying)- but I cannot figure out how I can load the list of companies and loop through to carry out multiple searches. I have a feeling this is a fairly simple procedure.
So, my main question is - where in the spider class should I define the query array of companies to loop through? I do not know the exact URLs since each company has a unique ID and belongs to specific market - so I can not input them as start_urls.
Is Scrapy the right tool or should I have used mechanize - for this type of task?
Here is my current code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json
class DmozSpider(BaseSpider):
name = "dmoz"
allowed_domains = ["proff.se"]
start_urls = ["http://www.proff.se"]
# Search on the website, currently I have just put in a static search term here, but I would like to loop over a list of companies.
def parse(self, response):
return FormRequest.from_response(response, formdata={'q': rebtel},callback=self.search_result)
# I fetch the url from the search result and convert it to correct Financial-url where the information is located.
def search_result(self,response):
sel = HtmlXPathSelector(response)
link = sel.xpath('//ul[#class="company-list two-columns"]/li/a/#href').extract()
finance_url=str(link[0]).replace("/foretag","http://www.proff.se/nyckeltal")
return Request(finance_url,callback=self.parse_finance)
# I Scraped the information of this particular company, this is hardcoded and will not
# work for other responses. I had some issues with the encoding characters
# initially since they were Swedish. I also tried to target the Json element direct by
# revenue = sel.xpath('#//*[#id="accountTable1"]/tbody/tr[3]/#data-chart').extract()
# but was not able to parse it (error - expected string or buffer - tried to convert it
# to a string by str() with no luck, something off with the formatting, which is messing the the data types.
def parse_finance(self, response):
sel = HtmlXPathSelector(response)
datachart = sel.xpath('//tr/#data-chart').extract()
employees=json.loads(datachart[36])
revenue = json.loads(datachart[0])
items = []
item = DmozItem()
item['company']=response.url.split("/")[-5]
item['market']=response.url.split("/")[-3]
item['employees']=employees
item['revenue']=revenue
items.append(item)
return item

The common approach is to do this with a command-line argument. Give the spider's __init__ method an argument:
class ProffSpider(BaseSpider):
name = "proff"
...
def __init__(self, query):
self.query = query
def parse(self, response):
return FormRequest.from_response(response,
formdata={'q': self.query},
callback=self.search_result
)
...
And then start your spiders (maybe with Scrapyd):
$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"
If you want to run a bunch of spiders at once by passing in the arguments from a file, you can create a new command to run multiple instances of a spider. This is just mixing the builtin crawl command with the example code for running multiple spiders with a single crawler:
your_project/settings.py
COMMANDS_MODULE = 'your_project_module.commands'
your_project/commands/__init__.py
# empty file
your_project/commands/crawl_many.py
import os
import csv
from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError
class Command(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Run many instances of a spider'
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
if not opts.output:
return
if opts.output == '-':
self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
else:
self.settings.set('FEED_URI', opts.output, priority='cmdline')
feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
valid_output_formats = feed_exporters.keys()
if not opts.output_format:
opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')
if opts.output_format not in valid_output_formats:
raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))
self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
def run(self, args, opts):
if args:
raise UsageError()
with open(opts.input_file, 'rb') as handle:
for spider_options in csv.DictReader(handle):
spider = spider_options.pop('spider')
self.crawler_process.crawl(spider, **spider_options)
self.crawler_process.start()
You can run it like so:
$ scrapy crawl_many -f crawl_options.csv -o output_file.jsonl
The format of the crawl options CSV is simple:
spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd

The first thing I'd do is to create a list of companies and find a way to get the url of each one. After this crawling is easy. I have written a crawler to extract disease information from wikipedia from a list of diseases. See how it fits your use case.
import requests
from bs4 import BeautifulSoup
import sys
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from subprocess import Popen, check_call
from multiprocessing import Pool
#nltk.download()
def crawlwiki(keywords):
print (keywords)
columns = ['Category', 'Text']
page=1
print ('Fetching for {}....'.format(keywords))
url = 'https://en.wikipedia.org/wiki/'
for i in range(len(keywords)):
url = url + keywords[i]
url = url + '%20'
url = url[0:(len(url)-3)]
output_obj = {}
#curr_page = url+str(page)
while True:
try:
page_source = requests.get(url)
except:
#What you should do if internet connection fails
break
plain_text = page_source.text
bs_obj = BeautifulSoup(plain_text, "lxml")
'''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
base_url = 'http://www.webmd.com'
for div in toc_links:
links = div.findAll('a')
for a in links:
output_obj[a.text] = base_url + a.get('href')
print (base_url + a.get('href'))
data = bs_obj.findAll('div', {'class':'search-text-container'})
for div in data:
links = div.findAll('a')
for a in links:
output_obj[a.text] = a.get('href')
print (a.get('href'))'''
"""
Mapping:
1 : Signs and symptoms
2 : Diagnosis
3 : Prognosis
4 : Treatment
"""
symptom_text = re.findall ( '<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(symptom_text)
symptoms_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
symptom_data = symptoms_object.findAll('p')
symptom_paragraphs = ""
for p in symptom_data:
symptom_paragraphs += p.text
symptom_paragraphs = re.sub(r"/?\[\d+]" , '', symptom_paragraphs, re.DOTALL)
df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)
diagnosis_text = re.findall ( '<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(diagnosis_text)
diagnosis_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
diagnosis_data = diagnosis_object.findAll('p')
diagnosis_paragraphs = ""
for p in diagnosis_data:
diagnosis_paragraphs += p.text
diagnosis_paragraphs = re.sub(r"/?\[\d+]" , '', diagnosis_paragraphs, re.DOTALL)
df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)
prognosis_text = re.findall ( '<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(prognosis_text)
prognosis_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
prognosis_data = prognosis_object.findAll('p')
prognosis_paragraphs = ""
for p in prognosis_data:
prognosis_paragraphs += p.text
prognosis_paragraphs = re.sub(r"/?\[\d+]" , '', prognosis_paragraphs, re.DOTALL)
df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)
treatment_text = re.findall ( '<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(treatment_text)
treatment_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
treatment_data = treatment_object.findAll('p')
treatment_paragraphs = ""
for p in treatment_data:
treatment_paragraphs += p.text
treatment_paragraphs = re.sub(r"/?\[\d+]" , '', treatment_paragraphs, re.DOTALL)
df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)
df = pd.DataFrame(columns = columns)
df = df.append(df_1.append(df_2.append(df_3.append(df_4))))
return df
print('Fetch completed....')
def main():
disease_df = pd.read_csv("disease.txt", sep="\n", header=None)
columns = ['Category', 'Text']
df_data = pd.DataFrame(columns=columns)
size = disease_df.size
print("Initializing....")
p = Pool(5)
df_data = p.map(crawlwiki, disease_df.values.tolist())
"""for index, row in disease_df.iterrows():
print('Iteration {0} out of {1}.....'.format(index+1, size))
df = crawlwiki(row, columns)
df_data = df_data.append(df)"""
df_data.to_csv("TagDataset.csv", index=False)
if __name__ == '__main__':
main()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy crawler does not scrape or print results in CSV - python

Related

recursive web crawling in python

Stuck with Data Crawling on Scrapy

Scrapy (Python): Iterating over 'next' page without multiple functions

Problems with passing global variables in a Python scrapy project

Scrapy multiple search terms

Categories

Resources