I am very new to Python and I am learning how to scrape web pages (one day in). The task I want to achieve is to loop through a list of 2000 companies and extract revenue data and the number of employees. I started with Scrapy, and I have managed to get the workflow working for one company (not elegant, but at least I am trying) - but I cannot figure out how to load the list of companies and loop through it to carry out multiple searches. I have a feeling this is a fairly simple procedure.
So, my main question is - where in the spider class should I define the query array of companies to loop through? I do not know the exact URLs, since each company has a unique ID and belongs to a specific market, so I cannot pass them in as start_urls.
Is Scrapy the right tool for this type of task, or should I have used mechanize?
Here is my current code.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
from scrapy.http import Request
from tutorial.items import DmozItem
import json
class DmozSpider(BaseSpider):
name = "dmoz"
allowed_domains = ["proff.se"]
start_urls = ["http://www.proff.se"]
# Search on the website, currently I have just put in a static search term here, but I would like to loop over a list of companies.
def parse(self, response):
return FormRequest.from_response(response, formdata={'q': 'rebtel'}, callback=self.search_result)
# I fetch the url from the search result and convert it to correct Financial-url where the information is located.
def search_result(self,response):
sel = HtmlXPathSelector(response)
link = sel.xpath('//ul[@class="company-list two-columns"]/li/a/@href').extract()
finance_url=str(link[0]).replace("/foretag","http://www.proff.se/nyckeltal")
return Request(finance_url,callback=self.parse_finance)
# I Scraped the information of this particular company, this is hardcoded and will not
# work for other responses. I had some issues with the encoding characters
# initially since they were Swedish. I also tried to target the Json element direct by
# revenue = sel.xpath('//*[@id="accountTable1"]/tbody/tr[3]/@data-chart').extract()
# but was not able to parse it (error - expected string or buffer). I tried to convert it
# to a string with str() with no luck; something is off with the formatting, which is messing up the data types.
def parse_finance(self, response):
sel = HtmlXPathSelector(response)
datachart = sel.xpath('//tr/@data-chart').extract()
employees=json.loads(datachart[36])
revenue = json.loads(datachart[0])
items = []
item = DmozItem()
item['company']=response.url.split("/")[-5]
item['market']=response.url.split("/")[-3]
item['employees']=employees
item['revenue']=revenue
items.append(item)
return item
The common approach is to do this with a command-line argument. Give the spider's __init__ method an argument:
class ProffSpider(BaseSpider):
name = "proff"
...
def __init__(self, query):
self.query = query
def parse(self, response):
return FormRequest.from_response(response,
formdata={'q': self.query},
callback=self.search_result
)
...
And then start your spiders (maybe with Scrapyd):
$ scrapy crawl proff -a query="something"
$ scrapy crawl proff -a query="something else"
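Alternatively, if you would rather keep everything in a single run, you can read the company list inside the spider itself and yield one search request per company. A minimal sketch, assuming a plain-text file called companies.txt with one company name per line (the file name and location are my own choice, not part of the original question or answer):
import scrapy
from scrapy.http import FormRequest

class ProffListSpider(scrapy.Spider):
    name = "proff_list"
    allowed_domains = ["proff.se"]
    start_urls = ["http://www.proff.se"]

    def parse(self, response):
        # Load the query list once, then submit the search form once per company.
        with open("companies.txt") as handle:
            companies = [line.strip() for line in handle if line.strip()]
        for company in companies:
            yield FormRequest.from_response(
                response,
                formdata={"q": company},
                callback=self.search_result,
            )

    def search_result(self, response):
        # Same per-company parsing as in the question's spider.
        pass
Each submitted form produces its own response, so the per-company parsing does not need to change.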
If you want to run a bunch of spiders at once by passing in the arguments from a file, you can create a new command to run multiple instances of a spider. This is just mixing the built-in crawl command with the example code for running multiple spiders with a single crawler:
your_project/settings.py
COMMANDS_MODULE = 'your_project_module.commands'
your_project/commands/__init__.py
# empty file
your_project/commands/crawl_many.py
import os
import csv
from scrapy.commands import ScrapyCommand
from scrapy.utils.python import without_none_values
from scrapy.exceptions import UsageError
class Command(ScrapyCommand):
requires_project = True
def syntax(self):
return '[options]'
def short_desc(self):
return 'Run many instances of a spider'
def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option('-f', '--input-file', metavar='FILE', help='CSV file to load arguments from')
parser.add_option('-o', '--output', metavar='FILE', help='dump scraped items into FILE (use - for stdout)')
parser.add_option('-t', '--output-format', metavar='FORMAT', help='format to use for dumping items with -o')
def process_options(self, args, opts):
ScrapyCommand.process_options(self, args, opts)
if not opts.output:
return
if opts.output == '-':
self.settings.set('FEED_URI', 'stdout:', priority='cmdline')
else:
self.settings.set('FEED_URI', opts.output, priority='cmdline')
feed_exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
valid_output_formats = feed_exporters.keys()
if not opts.output_format:
opts.output_format = os.path.splitext(opts.output)[1].replace('.', '')
if opts.output_format not in valid_output_formats:
raise UsageError('Unrecognized output format "%s". Valid formats are: %s' % (opts.output_format, tuple(valid_output_formats)))
self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
def run(self, args, opts):
if args:
raise UsageError()
with open(opts.input_file, 'rb') as handle:
for spider_options in csv.DictReader(handle):
spider = spider_options.pop('spider')
self.crawler_process.crawl(spider, **spider_options)
self.crawler_process.start()
You can run it like so:
$ scrapy crawl_many -f crawl_options.csv -o output_file.jsonl
The format of the crawl options CSV is simple:
spider,query,arg2,arg3
proff,query1,value2,value3
proff,query2,foo,bar
proff,query3,baz,asd
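Each data row starts one spider run: the spider column picks which spider to launch, and every other column is passed to it as a spider argument, so the first data row above is roughly equivalent to:
$ scrapy crawl proff -a query=query1 -a arg2=value2 -a arg3=value3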
The first thing I'd do is create a list of companies and find a way to get the URL of each one. After that, crawling is easy. I have written a crawler that extracts disease information from Wikipedia given a list of diseases. See how it fits your use case.
import requests
from bs4 import BeautifulSoup
import sys
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from subprocess import Popen, check_call
from multiprocessing import Pool
#nltk.download()
def crawlwiki(keywords):
print (keywords)
columns = ['Category', 'Text']
page=1
print ('Fetching for {}....'.format(keywords))
url = 'https://en.wikipedia.org/wiki/'
for i in range(len(keywords)):
url = url + keywords[i]
url = url + '%20'
url = url[0:(len(url)-3)]
output_obj = {}
#curr_page = url+str(page)
while True:
try:
page_source = requests.get(url)
except:
#What you should do if internet connection fails
break
plain_text = page_source.text
bs_obj = BeautifulSoup(plain_text, "lxml")
'''toc_links = bs_obj.findAll('div', {'class': 'toc-links'})
base_url = 'http://www.webmd.com'
for div in toc_links:
links = div.findAll('a')
for a in links:
output_obj[a.text] = base_url + a.get('href')
print (base_url + a.get('href'))
data = bs_obj.findAll('div', {'class':'search-text-container'})
for div in data:
links = div.findAll('a')
for a in links:
output_obj[a.text] = a.get('href')
print (a.get('href'))'''
"""
Mapping:
1 : Signs and symptoms
2 : Diagnosis
3 : Prognosis
4 : Treatment
"""
symptom_text = re.findall ( '<h2><span class="mw-headline" id="Signs_and_symptoms">Signs and symptoms</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(symptom_text)
symptoms_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
symptom_data = symptoms_object.findAll('p')
symptom_paragraphs = ""
for p in symptom_data:
symptom_paragraphs += p.text
symptom_paragraphs = re.sub(r"/?\[\d+]" , '', symptom_paragraphs, re.DOTALL)
df_1 = pd.DataFrame(data=[['1', symptom_paragraphs]], columns=columns)
diagnosis_text = re.findall ( '<h2><span class="mw-headline" id="Diagnosis">Diagnosis</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(diagnosis_text)
diagnosis_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
diagnosis_data = diagnosis_object.findAll('p')
diagnosis_paragraphs = ""
for p in diagnosis_data:
diagnosis_paragraphs += p.text
diagnosis_paragraphs = re.sub(r"/?\[\d+]" , '', diagnosis_paragraphs, re.DOTALL)
df_2 = pd.DataFrame(data=[['2', diagnosis_paragraphs]], columns=columns)
prognosis_text = re.findall ( '<h2><span class="mw-headline" id="Prognosis">Prognosis</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(prognosis_text)
prognosis_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
prognosis_data = prognosis_object.findAll('p')
prognosis_paragraphs = ""
for p in prognosis_data:
prognosis_paragraphs += p.text
prognosis_paragraphs = re.sub(r"/?\[\d+]" , '', prognosis_paragraphs, re.DOTALL)
df_3 = pd.DataFrame(data=[['3', prognosis_paragraphs]], columns=columns)
treatment_text = re.findall ( '<h2><span class="mw-headline" id="Treatment">Treatment</span>(.*?)<h2>', plain_text, re.DOTALL)
str1 = ''.join(treatment_text)
treatment_object = BeautifulSoup(str1, "lxml")
#paragraphs = re.findall('<p>(.*?)<p>', str1, re.DOTALL)
treatment_data = treatment_object.findAll('p')
treatment_paragraphs = ""
for p in treatment_data:
treatment_paragraphs += p.text
treatment_paragraphs = re.sub(r"/?\[\d+]" , '', treatment_paragraphs, re.DOTALL)
df_4 = pd.DataFrame(data=[['4', treatment_paragraphs]], columns=columns)
df = pd.DataFrame(columns = columns)
df = df.append(df_1.append(df_2.append(df_3.append(df_4))))
return df
print('Fetch completed....')
def main():
disease_df = pd.read_csv("disease.txt", sep="\n", header=None)
columns = ['Category', 'Text']
df_data = pd.DataFrame(columns=columns)
size = disease_df.size
print("Initializing....")
p = Pool(5)
df_data = pd.concat(p.map(crawlwiki, disease_df.values.tolist()), ignore_index=True)  # Pool.map returns a list of DataFrames; combine them before saving
"""for index, row in disease_df.iterrows():
print('Iteration {0} out of {1}.....'.format(index+1, size))
df = crawlwiki(row, columns)
df_data = df_data.append(df)"""
df_data.to_csv("TagDataset.csv", index=False)
if __name__ == '__main__':
main()
Related
I'm attempting to record the information from the job site Indeed.com for all jobs resulting from a specific search term.
However, although when I search that term the site says there are X jobs available (i.e., "showing Page 1 of X jobs"), the number of unique jobs I record (after removing duplicates) is much less. The number is also not consistent if I repeat the scrape, and there are duplicates.
This makes me wonder if there is some shuffling of the contents (think sampling with replacement) so that there are unique jobs that I don't see.
Alternatively, it could be that the number of jobs isn't shown correctly on the site. For example, if you go to the last page it shows only approximately 620 jobs of the alleged 920. But this doesn't explain why I don't consistently get the same number of unique results if I run the code twice in quick succession.
Any thoughts?
The Python3 code is here if you want to run it.
Requires: requests, bs4, pandas, numpy
# modified from https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
from html.parser import HTMLParser
from datetime import datetime
import numpy as np
import re
from itertools import chain
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.strict = False
self.convert_charrefs= True
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
result = s.get_data()
# result = result.strip('\n')
result = result.replace('\n',' ')
return result
keyword = 'sustainability'
state = 'mi'
page_limit = 50
# init the dataframe
columns = ['job_code','keyword', 'job_title', 'company_name', 'location', 'salary']
jobs = []
div_list = []
# determine max number of results
page = requests.get('http://www.indeed.com/jobs?q={}&l={}'.format(keyword,state.lower()))
soup = BeautifulSoup(page.content, "html.parser")
max_result_bs4 = soup.find(name='div', attrs = {'id':'searchCount'})#.find('directory').text
max_results = int(max_result_bs4.contents[0].split(' ')[-2].replace(',',''))
#scraping code:
# loop through pages
for start in chain(range(0, max_results, page_limit),range(0, max_results, page_limit)):
url = 'https://www.indeed.com/jobs?q={}&start={}&limit={}&l={}&sort=date'.format(keyword, start, page_limit, state.lower())
page = requests.get(url)
time.sleep(0.01) #ensuring at least 0.01 second between page grabs
soup = BeautifulSoup(page.content, 'html.parser', from_encoding='utf-8')
# record the divs
div_list += soup.find_all(name='div', attrs={'data-tn-component': 'organicJob'})
# format the scrapes
for div in div_list:
#creating an empty list to hold the data for each posting
job_post = []
# get the job code
job_code = div['data-jk']
job_post.append(job_code)
#append keyword name
job_post.append(keyword)
#grabbing job title
for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
title = a['title']
if title:
job_post.append(title)
else:
job_post.append('Not found')
#grabbing company name
company = div.find_all(name='span', attrs={'class':'company'})
if len(company) > 0:
for b in company:
job_post.append(b.text.strip())
else:
sec_try = div.find_all(name='span', attrs={'class':'result-link-source'})
for span in sec_try:
job_post.append(span.text)
if len(job_post) == 3:
job_post.append('Not found')
#grabbing location name
job_post.append(state)
#grabbing salary
try:
job_post.append(div.find('nobr').text)
except:
try:
job_post.append(div.find(name='span', attrs={'class':'salary no-wrap'}).text.strip())
except:
job_post.append('Nothing_found')
#appending list of job post info to dataframe at index num
jobs.append(job_post)
df = pd.DataFrame(jobs, columns = columns)
#saving df as a local csv file — define your own local path to save contents
todays_date = datetime.now().strftime('%Y%m%d')
df.to_csv('indeed_{}.csv'.format(todays_date), encoding='utf-8')
df_len = len(df)
unique_jobs = len(np.unique(df.job_code))
print('Found {} unique jobs from an alleged {} after recording {} postings'.format(unique_jobs, max_results, df_len))
It seems like this Scrapy spider locates the links that it is supposed to follow in order to collect additional information, but it either doesn't go to the next page or is unable to collect the information on the other page. I checked the XPath expressions; they all appear to be correct.
Terminal output:
2017-01-10 10:31:16 [scrapy.extensions.logstats] INFO: Crawled 213 pages (at 23 pages/min), scraped 0 items (at 0 items/min)
Code:
#!/usr/bin/env python
import types
import time
from datetime import date, datetime, timedelta
import requests
import msgpack
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector, Selector
from resume_data.items import ResumeDataItem, ResultListItem, WorkItem, SchoolItem, ItemList
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from bs4.element import NavigableString
class ResumeIndeedSpider(CrawlSpider):
name = "indeed_resume"
allowed_domains = ["indeed.com"]
start_urls = ['http://www.indeed.com/resumes/mechanical-engineer',
'http://www.indeed.com/resumes/mechanical-engineering',
'http://www.indeed.com/resumes/piping-engineer',
'http://www.indeed.com/resumes/design-engineer',
'http://www.indeed.com/resumes/project-engineer']
#def __init__(self, filename=None):
#self.unis = list()
rules = (Rule (SgmlLinkExtractor(restrict_xpaths = ('//a[contains(@class,"app_link")]')), callback = "parse_item", follow = True),)
def parse_item(self, response):
hxs = Selector(response)
digest = hxs.xpath('//ol[@class="resultsList"]')
records = ResumeDataItem()
url_prefix = 'http://www.indeed.com'
resume_links = digest.xpath('//li[@class="sre"]//div[@class="sre-entry"]')
names = digest.xpath('//a[@target="_blank"]/text()').extract()
links = digest.xpath('//a[@target="_blank"]/@href').extract()
for name, link in zip(names,links):
if name not in 'Feedback':
records['name'] = name
records['link'] = url_prefix+link
yield Request(records['link'], meta={'item': records}, callback= self.parse_node)
def parse_node(self, response):
hxs = Selector(response)
records = ResumeDataItem()
# name = hxs.xpath('/text()').extract()
name = hxs.xpath('//h1[@id="resume-contact"]/text()').extract()
headline = hxs.xpath('//h2[@id="headline"]/text()').extract()
# locale = hxs.xpath('//div[@class="addr" and @itemprop="address"]//p//text()').extract()
rlocale = hxs.xpath('//p[@id="headline_location" and @class="locality"]//text()').extract()
summary = hxs.xpath('//p[@id="res_summary" and @class="summary"]/text()').extract()
skills = list()
skill = hxs.xpath('//div[@id="skills-items" and @class="items-container"]//p//text()').extract()
if len(skill) != 0:
skills.append(''.join(skill).encode('utf-8'))
skill = hxs.xpath('//div[@id="additionalinfo-section" and @class="last"]//div[@class="data_display"]//p//text()').extract()
if len(skill) != 0:
skills.append(''.join(skill).encode('utf-8'))
resume_links = list()
links = hxs.xpath('//div[@id="link-items" and @class="items-container"]//p//text()').extract()
for link in links:
resume_links.append(''.join(link).encode('utf-8'))
workHistory = ItemList()
experience = hxs.xpath('//div[@id="work-experience-items"]/div')
for elem in experience:
item = elem.xpath('div')
for entry in item:
workEntry = WorkItem()
title = entry.xpath('p[@class="work_title title"]//text()').extract()
workEntry['title'] = ''.join(title).encode('utf-8')
company = entry.xpath('div[@class="work_company"]/span/text()').extract()
workEntry['company'] = ''.join(company).encode('utf-8')
location = entry.xpath('div[@class="work_company"]/div[@class="inline-block"]/span/text()').extract()
workEntry['work_location'] = ''.join(location).encode('utf-8')
dates = entry.xpath('p[@class="work_dates"]//text()').extract()
dates_str = ''.join(dates).encode('utf-8').split(' to ')
if len(dates) > 0:
if dates_str[0]:
workEntry['start_date'] = dates_str[0]
if dates_str[1]:
workEntry['end_date'] = dates_str[1]
else:
workEntry['start_date'] = 'NULL'
workEntry['end_date'] = 'NULL'
description = entry.xpath('p[@class="work_description"]//text()').extract()
workEntry['description'] = ''.join(description).encode('utf-8')
workHistory.container.append(workEntry)
eduHistory = ItemList()
education = hxs.xpath('//div[@id="education-items" and @class="items-container"]/div')
for elem in education:
item = elem.xpath('div')
for entry in item:
eduEntry = SchoolItem()
degree = entry.xpath('p[@class="edu_title"]/text()').extract()
degree = ''.join(degree).encode('utf-8')
eduEntry['degree'] = degree
school = entry.xpath('div[@class="edu_school"]/span//text()').extract()
school = ''.join(school).encode('utf-8')
eduEntry['school'] = school
locale = entry.xpath('span[@itemprop="addressLocality"]/text()').extract()
locale = ''.join(locale).encode('utf-8')
eduEntry['locale'] = locale
grad_date = entry.xpath('p[@class="edu_dates"]/text()').extract()
dates_str = ''.join(grad_date).encode('utf-8').split(' to ')
if len(grad_date) > 0:
if len(dates_str) == 2:
if dates_str[0]:
eduEntry['admit_date'] = dates_str[0]
try:
if dates_str[1]:
eduEntry['grad_date'] = dates_str[1]
except:
pass
elif len(dates_str) == 1:
if dates_str[0]:
eduEntry['grad_date'] = dates_str[0]
eduEntry['admit_date'] = 'NULL'
else:
eduEntry['admit_date'] = 'NULL'
eduEntry['grad_date'] = 'NULL'
eduHistory.container.append(eduEntry)
records['url'] = response.url
records['name'] = ''.join(name).encode('utf-8')
records['headline'] = msgpack.packb(''.join(headline).encode('utf-8'))
records['locale'] = ''.join(rlocale).encode('utf-8')
records['summary'] = msgpack.packb(''.join(summary).encode('utf-8'))
records['skills'] = msgpack.packb(skills)
records['links'] = resume_links
#records['experience'] = msgpack.packb(workHistory, default=workHistory.encode)
records['experience'] = workHistory
records['education'] = msgpack.packb(eduHistory, default=eduHistory.encode)
#records['experience'] = workHistory
#records['education'] = eduHistory
return records
Obviously, this part of the code
for name, link in zip(names,links):
if name not in 'Feedback':
records['name'] = name
records['link'] = url_prefix+link
yield Request(records['link'], meta={'item': records}, callback= self.parse_node)
doesn't emit any link. Perhaps you meant if 'Feedback' not in name
Also note that the XPath here, digest.xpath('//a[@target="_blank"]/text()'), is applied to the whole DOM, not only to the part previously extracted for digest. If you want the XPath to be relative to the digest selector, you should use a leading dot, like this: digest.xpath('.//a[@target="_blank"]/text()')
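A minimal standalone sketch of that difference, using a made-up HTML snippet rather than the question's page:
from scrapy.selector import Selector

html = '<div id="digest"><a target="_blank">inside</a></div><a target="_blank">outside</a>'
sel = Selector(text=html)
digest = sel.xpath('//div[@id="digest"]')

# Starts from the document root again, so it matches both links.
print(digest.xpath('//a[@target="_blank"]/text()').extract())   # ['inside', 'outside']

# The leading dot makes the query relative to the digest selector.
print(digest.xpath('.//a[@target="_blank"]/text()').extract())  # ['inside']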
I have the following code, which scrapes a website for divs with the class "odd" or "even". I'd like to make "odd" and "even" arguments my function takes, which would allow me to add other divs as well. Here is my code:
#
# Imports
#
import urllib2
from bs4 import BeautifulSoup
import re
import os
from pprint import pprint
#
# library
#
def get_soup(url):
page = urllib2.urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
body = soup.findAll("tr", ["even", "odd"])
string_list = str([i for i in body])
return string_list
def save_to_file(path, soup):
with open(path, 'w') as fhandle:
fhandle.write(soup)
#
# script
#
def main():
url = r'URL GOES HERE'
path = os.path.join('PATH GOES HERE')
the_soup = get_soup(url)
save_to_file(path, the_soup)
if __name__ == '__main__':
main()
I'd like to incorporate *args into the code so the get_soup function would look like this:
def get_soup(url, *args):
page = urllib2.urlopen(url)
contents = page.read()
soup = BeautifulSoup(contents, "html.parser")
body = soup.findAll("tr", [args])
string_list = str([i for i in body])
return string_list
def main():
url = r'URL GOES HERE'
path = os.path.join('PATH GOES HERE')
the_soup = get_soup(url, "odd", "even")
save_to_file(path, the_soup)
Unfortunately, this isn't working. Ideas?
Don't put args in a list; args is already a tuple, so just pass that:
body = soup.findAll("tr", args)
If you wrap it as [args], you would end up with something like [("odd","even")].
Also, str([i for i in body]) makes no real sense; it would be the same as just doing str(body), and I don't see how that format could be useful.
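Putting that together, a corrected get_soup might look like this sketch (converting the tuple to a list simply to match the form the original code already used):
import urllib2
from bs4 import BeautifulSoup

def get_soup(url, *args):
    # args arrives as a tuple, e.g. ("odd", "even")
    page = urllib2.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents, "html.parser")
    body = soup.findAll("tr", list(args))
    return str(body)

# usage
the_soup = get_soup('URL GOES HERE', "odd", "even")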
First of all, this is my code:
from twisted.internet import reactor
from scrapy.crawler import CrawlerProcess, CrawlerRunner
import scrapy
#from scrapy import log, signals
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from scrapy.settings import Settings
import datetime
from multiprocessing import Process, Queue
import os
from scrapy.http import Request
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
import re
#query=raw_input("Enter a product to search for= ")
query='apple'
query1=query.replace(" ", "+")
class DmozItem(scrapy.Item):
productname = scrapy.Field()
product_link = scrapy.Field()
current_price = scrapy.Field()
mrp = scrapy.Field()
offer = scrapy.Field()
imageurl = scrapy.Field()
outofstock_status = scrapy.Field()
add = scrapy.Field()
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["http://www.bestmercato.com"]
def start_requests(self):
task_urls = [
]
i=1
for i in range(1,2):
temp=("https://www.bestmercato.com/index.php?route=product/search&search="+query1+"&page="+str(i))
task_urls.append(temp)
i=i+1
start_urls = (task_urls)
# p=len(task_urls)
return [ Request(url = start_url) for start_url in start_urls ]
def parse(self, response):
items = []
for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
item = DmozItem()
item['productname'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/text()').extract())[3:-2]
item['product_link'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[@class="name"]/a/@href').extract())[3:-2]
point1 = sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]').extract()
point = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/@class').extract())[3:-2]
checker = "options" in point
item['current_price'] = ""
if checker:
i=1
p=1
while i==1:
t = str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[4]/div/select/option['+str(p)+']/text()').extract())[3:-2]
#print t
if 'Rs' not in t:
i = 2
elif 'Rs' in t:
i = 1
t= " ".join(t)
s = t.translate(None, '\ t')[:-2]
item['current_price'] = item['current_price'] + ' ; ' + s
p = p+1
item['current_price'] = item['current_price'][3:-3]
else:
item['current_price'] = 'Rs. ' + str(sel.xpath('div[@class="product-thumb"]/div[@class="small_detail"]/div[not (@class="name") or not(@class="description") or not(@class="qty") or not(@class="box_btn_icon")]/text()').extract())[46:-169]
re.findall(r"[-+]?\d*\.\d+|\d+", item["current_price"])
try:
test1 = str(sel.xpath('div/div[2]/div[3]/span[1]/text()').extract())[3:-2]
_digits = re.compile('\d')
if bool(_digits.search(test1)):
print 'hi'
test1=test1[:2]+'. '+test1[3:]
item['mrp'] = test1
#item['mrp'][2:2]='.'
test2 = str(sel.xpath('div/div[2]/div[3]/span[2]/text()').extract())[3:-2]
test2=test2[:2]+'. '+test2[3:]
item['current_price']=test2
else:
item['mrp'] = item['current_price']
except:
item['mrp'] = item['current_price']
item['offer'] = 'No additional offer available'
item['imageurl'] = str(sel.xpath('div[@class="product-thumb"]/div[@class="image"]/a[not (@class="sft_quickshop_icon")]/img[@class="img-responsive"]/@src').extract())[3:-2]
item['outofstock_status'] = str('In Stock')
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
# print item
items.append(item)
return request
print (items)
def parse2(self, response):
item = response.meta['item']
item['add'] = response.url
return item
spider1 = DmozSpider()
settings = Settings()
settings.set("PROJECT", "dmoz")
settings.set("CONCURRENT_REQUESTS" , 100)
#)
#settings.set( "DEPTH_PRIORITY" , 1)
#settings.set("SCHEDULER_DISK_QUEUE" , "scrapy.squeues.PickleFifoDiskQueue")
#settings.set( "SCHEDULER_MEMORY_QUEUE" , "scrapy.squeues.FifoMemoryQueue")
crawler = CrawlerProcess(settings)
crawler.crawl(spider1)
crawler.start()
Now, these are the issues that I am facing.
1. There are numerous divs that can be found with this xpath - '//html/body/div/div/div[4]/div/div/div[5]/div'. However, the above code scrapes the contents of only the first div, i.e., the one with the xpath 'html/body/div/div/div[4]/div/div/div[5]/div[1]', and not all of them.
The moment I comment out these three lines, the scraper scrapes everything, but obviously then I am not able to add the 'add' field to the item:
request = Request(str(item['product_link']),callback=self.parse2, dont_filter=True)
request.meta['item'] = item
return request
So, I want to scrape all the divs, in addition to having the 'add' field in my item class (notice the class DmozItem). How do I do that? Please give corrected code for my SPECIFIC case; it would be best that way!
2. Secondly, as I said, when I comment out the three lines that I mentioned above, the program scrapes everything in a time close to 5 seconds (around 4.9 seconds).
But as soon as I un-comment those 3 lines (again, the ones mentioned above), the program's run time increases drastically, and it finishes in a time close to 9 seconds (around 8.8 - 8.9 seconds). Why does this happen? Is it because of dont_filter=True? Please suggest ways to overcome this, as the run time can prove to be a very big problem for me. Also, can I somehow decrease the initial time of 5 seconds (around 4.9)?
Use html/body/div/div/div[4]/div/div/div[5]//div to get all divs after div[5].
EDIT:
This is the correct xpath - //html/body/div/div/div[4]/div/div/div[5]/div - which gave all the divs after div[5]. The one mentioned previously gave multiple errors!
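For reference, the difference between the two suggestions is direct children versus all descendants. A tiny standalone illustration, using made-up HTML rather than the site's markup:
from scrapy.selector import Selector

html = '<div id="top"><div>child<div>grandchild</div></div></div>'
sel = Selector(text=html)

print(len(sel.xpath('//div[@id="top"]/div')))   # 1 -> direct child divs only
print(len(sel.xpath('//div[@id="top"]//div')))  # 2 -> all descendant divs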
If you do a return statement inside the loop, you end the whole method execution. So if you enable those three lines, you end the execution of your method (and the for loop) after the first element.
This means you should yield your request instead of returning it.
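A minimal sketch of how the end of parse could look with yield, keeping the question's structure (the field-filling code is elided, and the items list is dropped since each item now travels with its own request):
def parse(self, response):
    for sel in response.xpath('//html/body/div/div/div[4]/div/div/div[5]/div'):
        item = DmozItem()
        # ... fill in the item fields exactly as before ...
        request = Request(str(item['product_link']), callback=self.parse2, dont_filter=True)
        request.meta['item'] = item
        yield request  # one request per div; the for loop keeps running

def parse2(self, response):
    item = response.meta['item']
    item['add'] = response.url
    return item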
In a Scrapy project I am doing, I am having difficulty sending a variable containing a list from one function to another. I need to do so because I need to combine the values from one page with those from another at the end of the script. The code is as follows:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider,Rule
from six import string_types
from datetime import datetime
from decimal import Decimal
import itertools
import numpy
import urlparse
import scrapy
class DmozSpider(Spider):
name = "dnot"
allowed_domains = ["ca.finance.yahoo.com", "http://eoddata.com/"]
start_urls = [
"http://eoddata.com/stocklist/TSX.htm"
]
def parse(self,response):
companyList = response.xpath('//tr[@class="ro"]/td/a/text()').extract()
for company in companyList:
go = 'https://ca.finance.yahoo.com/q/hp?s={0}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m'.format(company)
yield Request(go, self.stocks1)
def stocks1(self, response):
# global returns_page1
# EAFP = Easier to ask for forgiveness then permission
# Gathers ONLY adjusted closing stock price
global returns_page1
returns_page1 = []
rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
datetime.strptime(cells[0], "%b %d, %Y")
values = cells[-1]
returns_page1.append(values)
except ValueError:
continue
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks2)
def stocks2(self, response):
item = Website()
global returns_page1
returns_page2 = []
rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
datetime.strptime(cells[0], "%b %d, %Y")
values = cells[-1]
returns_page2.append(values)
except ValueError:
continue
returns_tot = returns_page1 + returns_page2
returns_dec = [Decimal(float(i)) for i in returns_tot]
returns = [float(n) for n in returns_dec]
items = []
item = Website()
item['url'] = response.url
item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
item['avgreturns'] = numpy.mean(returns)
item['varreturns'] = numpy.var(returns)
item['sdreturns'] = numpy.std(returns)
item['returns'] = returns
items.append(item)
yield item
I am trying to combine returns_page1 from the stocks1 function with returns_page2, which is gathered in the stocks2 function. However, my output only gives me the values from returns_page2.
I know I can't put a return in the def stocks1 function because I have a yield in there. That's why I tried using global variables.
What am I doing wrong here?
The best way to pass values from one callback to another is the meta attribute of the request.
In the first function:
yield Request(next_page, self.stocks2, meta={'returns_page1': returns_page1})
In the second function:
returns_page1 = response.meta.get('returns_page1')
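Applied to the question's spider, the relevant parts would look roughly like this sketch (the row parsing is unchanged; the global is no longer needed):
def stocks1(self, response):
    returns_page1 = []
    rows = response.xpath('//table[@class="yfnc_datamodoutline1"]//table/tr')[1:]
    for row in rows:
        cells = row.xpath('.//td/text()').extract()
        try:
            datetime.strptime(cells[0], "%b %d, %Y")
            returns_page1.append(cells[-1])
        except ValueError:
            continue
    next_page = response.url + "&z=66&y=66"
    # Hand the first page's values to the next callback instead of using a global.
    yield Request(next_page, self.stocks2, meta={'returns_page1': returns_page1})

def stocks2(self, response):
    returns_page1 = response.meta.get('returns_page1', [])
    returns_page2 = []
    # ... same row parsing as above, filling returns_page2 ...
    returns_tot = returns_page1 + returns_page2
    # ... build and yield the Website item as before ...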