Scrapy: Scraping very select URLs

Scrapy: Scraping very select URLs - python

I am trying to scrape yahoo stocks for a school project, but I have no idea how to go through each link of a page with a very certain link. The goal is to iterate through each stock with a certain ending portion of the url like so:
Starting URL = ["https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"]
The next URL would be something like:
#Canadian Imperial(note the "CM"):
"https://ca.finance.yahoo.com/q/hp?s=CM.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
#Blackberry (note the "BB"):
"https://ca.finance.yahoo.com/q/hp?s=BB.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
etc...
In other words, the only thing that would change would be the characters between "hp?s=" and ".TO&a".
Wondering if this is possible or not. The ending portion of the URL must stay the same as that is the page I need to get to. Unfortuantely, there is no links within each page on yahoo to go to other stocks.
If I could do this with Scrapy's Rules and SmglLinkExtractor, that would be preferable.
Would appreciate any help!
Thanks!
Current Scrapy code:
from scrapy.spider import Spider
from scrapy.selector import Selector
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class DmozSpider(Spider):
name = "dmoz"
allowed_domains = ["ca.finance.yahoo.com"]
start_urls = [
"https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
]
rules = [
Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"), follow=True)
]
def parse(self, response):
item = Website()
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
print item['name']

Make a rule to follow the links matching the pattern:
rules = [
Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+&a=\d+&b=\d+&c=\d+&d=\d+&e=\d+&f=\d+&g=m"), follow=True)
]
Though, I am not sure that you need to check for all URL parameters here. Simplified version:
rules = [
Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+"), follow=True)
]
And, don't forget the imports:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

Here's an example of what I was talking about in the comments i left.
import urllib
import os
company_symbol = ["ACGL", "AFSI", "AGII", "AGNC", "ANAT", "ARCP", "ASBC", "ASPS", "BANF", "BBCN", "BGCP", "BNCL", "BOKF", "BPOP", "BRKL", "CACC", "CATY", "CBOE", "CBSH", "CFFN", "CHFC", "CINF", "CME ", "COLB", "CVBF", "ERIE", "ESGR", "ETFC", "EWBC", "EZPW", "FCFS", "FCNC", "FFBC", "FFIN", "FITB", "FMBI", "FMER", "FNFG", "FNGN", "FSRV", "FULT", "GBCI", "GLPI", "GLRE", "HBAN", "HBHC", "HLSS", "HOMB", "IBKC", "IBKR", "IBOC", "IPCC", "ISBC", "KRNY", "LPLA", "MBFI", "MHLD", "MKTX", "MTGE", "NAVG", "NBTB", "NDAQ", "NFBK", "NPBC", "NTRS", "NWBI", "ORIT", "OZRK", "PACW", "PBCT", "PCH ", "PNFP", "PRAA", "PVTB", "ROIC", "SAFT", "SBNY", "SBRA", "SCBT", "SEIC", "SIGI", "SIVB", "SLM ", "STFC", "SUSQ", "TCBI", "TFSL", "TRMK", "TROW", "UBSI", "UMBF", "UMPQ", "VRTS", "WABC", "WAFD", "WETF", "WRLD", "WTFC", "Z", "ZION"]
for company in company_symbol:
url = 'http://finance.google.com/finance/info?client=ig&q={0}:{1}'.format(company, 'NASDAQ')
nasdaq = urllib.urlopen(url)
text = nasdaq.read()
filename = 'nasdaq.txt'.format(company)
with file(filename, 'a') as output:
output.write(str(text))
This code will was written as an example of one way to change urls and do something with each url.

If you need to scrape only predefined quotes for given period, then the logic is following:
Prepare the list of quotes you interested in e.g. ['ABC', 'XYZ', 'LOL', ...].
Use basic scrapy.Spider.
Define start_requests() method and yield a sequence of requests from it.
Sample implementation:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class QuotesSpider(scrapy.Spider):
name = "quotes"
allowed_domains = ["ca.finance.yahoo.com"]
quotes = ["BMO", "CM", "BB"]
url_template = "https://ca.finance.yahoo.com/q/hp?s=%s.TO\
&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
def start_requests(self):
for quote in self.quotes:
url = self.url_template % quote
yield Request(url)
def parse(self, response):
# process
But if you need to get ALL TSX quotes data, then I would recommend you to scrape them from available listings and then use as in above example. Crawling the entire ca.finance.yahoo.com is obviously a bad idea.

If you have a list of stocks you want to load the yahoo page for, you can get a list of the yahoo urls like this:
url_template = "https://ca.finance.yahoo.com/q/hp?s={}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
stocks = ['CM', 'BB']
urls = [url_template.format(stock) for stock in stocks]
I haven't used scrapy, though, so I'm not sure if this is what you need.

Related

Scrape multilevel menu using Scrapy 1.5

I am trying to get all links from multilevel menu.
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BbcSpider(CrawlSpider):
name = 'bbc'
allowed_domains = ['bbcgoodfood.com']
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
rules = (
Rule(LinkExtractor(allow=(r'/recipes/category/[\w-]+$'), restrict_xpaths='//article[contains(#class, "cleargridindent")]'), callback='parse_sub_categories', follow=True),
Rule(LinkExtractor(allow=(r'/recipes/collection/[\w-]+$'), restrict_xpaths='//article[contains(#class, "cleargridindent")]'), callback='parse_collections', follow=True),
)
def parse_sub_categories(self, response):
l = ItemLoader(item=FoodisgoodItem(), response=response)
l.default_output_processor = TakeFirst()
l.add_xpath('category_title', '//h1[#class="section-head--title"]/text()')
l.add_value('page_url', response.url)
yield l.load_item()
def parse_collections(self, response):
l = ItemLoader(item=FoodisgoodItem(), response=response)
l.default_output_processor = TakeFirst()
l.add_xpath('collection_title', '//h1[#class="section-head--title"]/text()')
l.add_value('page_url', response.url)
yield l.load_item()
Results of menu scraping
But I cant understand how populate empty first column before collection title.
For now I have:
EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
But I need:
Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
Can somebody give me advise what need to do to get result with subcategory in first column?
Thanks to everyone)

What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.

python scrapy code prints out the file I am reading from

I have written some python code with scrapy to extract some addresses from a website.
The first part of the code is putting together the start_urls by reading the latitude and longitude coordinates from a separate file googlecoords.txt which then form part of the start_urls. (The googlecoords.txt file I prepared previously converts UK postcodes in google coordinates for googlemaps).
So, for example, the first item in the start_url list is "https://www.howdens.com/process/searchLocationsNear.php?lat=53.674434&lon=-1.4908923&distance=1000&units=MILES" where "lat=53.674434&lon=-1.4908923" have come from the googlecoors.txt file.
However, when I run the code it works perfectly except that it prints out the googlecoords.txt file first - which I don't need.
How do I stop this print happening? (Though I can live with it.)
import scrapy
import sys
from scrapy.http import FormRequest, Request
from Howdens.items import HowdensItem
class howdensSpider(scrapy.Spider):
name = "howdens"
allowed_domains = ["www.howdens.com"]
# read the file that has a list of google coordinates that are converted from postcodes
with open("googlecoords.txt") as f:
googlecoords = [x.strip('\n') for x in f.readlines()]
# from the goole coordinates build the start URLs
start_urls = []
for a in range(len(googlecoords)):
start_urls.append("https://www.howdens.com/process/searchLocationsNear.php?{}&distance=1000&units=MILES".format(googlecoords[a]))
# cycle through 6 of the first relevant items returned in the text
def parse(self, response):
for sel in response.xpath('/html/body'):
for i in range(0,6):
try:
item = HowdensItem()
item['name'] =sel.xpath('.//text()').re(r'(?<="name":")(.*?)(?=","street")')[i]
item['street'] =sel.xpath('.//text()').re(r'(?<="street":")(.*?)(?=","town")')[i]
item['town'] = sel.xpath('.//text()').re(r'(?<="town":")(.*?)(?=","pc")')[i]
item['pc'] = sel.xpath('.//text()').re(r'(?<="pc":")(.*?)(?=","state")')[i]
yield item
except IndexError:
pass

Like someone in the comments pointed out you should load it up with json module in start_requests() method:
import scrapy
import json
class MySpider(scrapy.Spider):
start_urls = ['https://www.howdens.com/process/searchLocationsNear.php?lat=53.674434&lon=-1.4908923&distance=1000&units=MILES']
def parse(self, response):
data = json.loads(response.body_as_unicode())
items = data['response']['depots']
for item in items:
url_template = "https://www.howdens.com/process/searchLocationsNear.php?{}&distance=1000&units=MILES"
url = url_template.format(item['lat']) # format in your location here
yield scrapy.Request(url, self.parse_item)
def parse_item(self, response):
print(response.url)

Scrapy correct xpath for unnamed div with image and text

I am building a Spider that traverses through several paginated pages and extracts data from the site:
http://www.usnews.com/education/best-global-universities/neuroscience-behavior
This is the spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
from usnews.items import UsnewsItem
class UniversitiesSpider(scrapy.Spider):
name = "universities"
allowed_domains = ["usnews.com"]
start_urls = (
'http://www.usnews.com/education/best-global-universities/neuroscience-behavior/',
)
#Rules = [
#Rule(LinkExtractor(allow=(), restrict_xpaths=('.//a[#class="pager_link"]',)), callback="parse", follow= True)
#]
def parse(self, response):
for sel in response.xpath('.//div[#class="sep"]'):
item = UsnewsItem()
item['name'] = sel.xpath('.//h2[#class="h-taut"]/a/text()').extract()
item['location'] = sel.xpath('.//span[#class="t-dim t-small"]/text()').extract()
item['ranking'] = sel.xpath('.//div[3]/div[2]/text()').extract()
item['score'] = sel.xpath('.//div[#class="t-large t-strong t-constricted"]/text()').extract()
#print(sel.xpath('.//text()').extract()
yield item
I am having problems extracting the text for the item "ranking". According to google chomes xpath suggestion the xpath is: //*[#id="resultsMain"]/div[1]/div[1]/div[3]/div[2] which gives me the single number for the first entry and a bunch of empty values. It seems to be implemented inside an img tag and I am confused on how to access it to just extract the thext (for example #1, #22 etc.)

The following XPath should find div containing img child, and then return non-empty text node child which contains the 'ranking' :
for sel in response.xpath('.//div[#class="sep"]'):
...
item['ranking'] = sel.xpath('div/div[img]/text()[normalize-space()]').extract()

Scraping HTML into CSV

I want to extract the contents like side-effects, warning, dosage from the site mentioned in the start urls. The following is my code. The csv file is getting created but nothing is displayed. The output is:
before for
[] # it is displaying empty list
after for
This is my code:
from scrapy.selector import Selector
from medicinelist_sample.items import MedicinelistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class MedSpider(CrawlSpider):
name = "med"
allowed_domains = ["medindia.net"]
start_urls = ["http://www.medindia.net/doctors/drug_information/home.asp?alpha=z"]
rules = [Rule(SgmlLinkExtractor(allow=('Zafirlukast.htm',)), callback="parse", follow = True),]
global Selector
def parse(self, response):
hxs = Selector(response)
fullDesc = hxs.xpath('//div[#class="report-content"]//b/text()')
final = fullDesc.extract()
print "before for" # this is just to see if it was printing
print final
print "after for" # this is just to see if it was printing

Your scrapy spider class's parse method should return item(s). With the current code, I do not see any item being returned. An example would be,
def parse_item(self, response):
self.log('Hi, this is an item page! %s' % response.url)
sel = Selector(response)
item = Item()
item['id'] = sel.xpath('//td[#id="item_id"]/text()').re(r'ID: (\d+)')
item['name'] = sel.xpath('//td[#id="item_name"]/text()').extract()
item['description'] = sel.xpath('//td[#id="item_description"]/text()').extract()
return item
For more information, take a look at the CrawlSpider example in the official scrapy docs.

Another problem in your code is that you are overriding the CrawlSpider's parse method to implement callback logic. This mustn't be done with CrawlSpiders since the parse method is used in its logic.
Ashish Nitin Patil has implicitly noted that already by naming his example function *parse_item*.
What the default implementation of a Crawl Spider's parse method basically does is to call the callbacks, that you've specified in the rule definitions; so if you override it, I think your callbacks won't be called at all. See Scrapy Doc - crawling rules

I just have experimented a bit with the site that you are crawling. As you would like to extract some data about the medicine (like the name, indications, contraindications, etc.) out the different sites on this domain: Wouldn't the following or similar XPath expressions fit your needs? I think your current query would give you just the "headers", but the actual info on this site is in the textnodes that follow those bold-rendered headers.
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from Test.items import TestItem
from scrapy.item import Item, Field
class Medicine(Item):
name = Field()
dosage = Field()
indications = Field()
contraindications = Field()
warnings = Field()
class TestmedSpider(CrawlSpider):
name = 'testmed'
allowed_domains = ['http://www.medindia.net/doctors/drug_information/']
start_urls = ['http://www.http://www.medindia.net/doctors/drug_information/']
rules = (
Rule(SgmlLinkExtractor(allow=r'Zafirlukast.htm'), callback='parse_item', follow=True),
)
def parse_item(self, response):
drug_info = Medicine()
selector = Selector(response)
name = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Generic Name')]//..//following-sibling::text()[1])''')
dosage = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Dosage')]//..//following-sibling::text()[1])''')
indications = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Why it is prescribed (Indications)')]//..//following-sibling::text()[1])''')
contraindications = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Contraindications')]//..//following-sibling::text()[1])''')
warnings = selector.xpath(r'''normalize-space(//div[#class="report-content"]//b/text()[contains(., 'Warnings and Precautions')]//..//following-sibling::text()[1])''')
drug_info['name'] = name.extract()
drug_info['dosage'] = dosage.extract()
drug_info['indications'] = indications.extract()
drug_info['contraindications'] = contraindications.extract()
drug_info['warnings'] = warnings.extract()
return drug_info
This would give you the following infos:
>scrapy parse --spider=testmed --verbose -d 2 -c parse_item --logfile C:\Python27\Scripts\Test\Test\spiders\test.log http://www.medindia.net/doctors/drug_information/Zafirlukast.htm
>>> DEPTH LEVEL: 1 <<<
# Scraped Items ------------------------------------------------------------
[{'contraindications': [u'Hypersensitivity.'],
'dosage': [u'Adult- The recommended dose is 20 mg twice daily.'],
'indications': [u'This medication is an oral leukotriene receptor antagonist (
LTRA), prescribed for asthma. \xa0It blocks the action of certain natural substa
nces that cause swelling and tightening of the airways.'],
'name': [u'\xa0Zafirlukast'],
'warnings': [u'Caution should be exercised in patients with history of liver d
isease, mental problems, suicidal thoughts, any allergy, elderly, during pregnan
cy and breastfeeding.']}]

Advice extracting //td text and numbers

I have been working through the tutorial adapting it to a project I want to achieve. I seem to have something going wrong that i just can't find the error to.
When using 'scrapy shell' I can get the response I expect. So for this site Nrl Ladder
In [1]: hxs.select('//td').extract()
Out[1]:
[u'<td>\r\n<div id="ls-nav">\r\n<ul><li><span>Home</span></li>\r\n<li class="ls-nav-on"><span>NRL</span></li>\r\n<li><span>NYC</span></li>\r\n<li><span>Rep Matches</span></li>\r\n\r\n</ul></div>\r\n</td>',
u'<td style="text-align:left" colspan="5">Round 4</td>',
u'<td colspan="5">Updated: 26/3/2012</td>',
u'<td style="text-align:left">1. Melbourne</td>',
u'<td>4</td>',
u'<td>4</td>',
u'<td>0</td>',
u'<td>0</td>',
u'<td>0</td>',
u'<td>122</td>',
u'<td>39</td>',
u'<td>83</td>',
u'<td>8</td>',
u'<td style="text-align:left">2. Canterbury-Bankstown</td>',
And on it goes.
I am really struggling to understand how to alter the tutorial project to change it to a different data type.
Is there anyway to bring up a help or documentation list to see what types I should use in items when using 'td' or any other item. Like i say it works easy in the shell but I cannot transform it to the files. Specifically both the team names and the points are 'td' but the team name is text.
here is what I have done.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from nrl.items import NrlItem
class nrl(BaseSpider):
name = "nrl"
allowed_domains = ["http://live.nrlstats.com/"]
start_urls = [
"http://live.nrlstats.com/nrl/ladder.html",
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
sites = hxs.select('//td')
items = []
for site in sites:
item = nrlItem()
item['team'] = site.select('/text()').extract()
item['points'] = site.select('/').extract()
items.append(item)
return items

I didn't quite understand your question, but here is a starting point, imo (haven't tested; see some comments in the code):
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from nrl.items import NrlItem
class nrl(BaseSpider):
name = "nrl"
allowed_domains = ["live.nrlstats.com"] # domains should be like this
start_urls = [
"http://live.nrlstats.com/nrl/ladder.html",
]
def parse(self, response):
hxs = HtmlXPathSelector(response)
rows = hxs.select('//table[#class="tabler"]//tr[starts-with(#class, "r")]') # select team rows
items = []
for row in rows:
item = nrlItem()
columns = row.select('./td/text()').extract() # select columns for the selected row
item['team'] = columns[0]
item['P'] = int(columns[1])
item['W'] = int(columns[2])
...
items.append(item)
return items
UPDATE:
//table[#class="tabler"//tr[starts-with(#class, "r")] is an xpath query. See some xpath examples here.
hxs.select(xpath_query) always returns a list of nodes (also of type HtmlXPathSelector) which fall under the given query.
hxs.extract() returns string representation of the node(s).
P.S. Beware that scrapy supports XPath 1.0, but not 2.0 (at least on Linux, not sure about Windows), so some of the newest xpath features might not work.
See also:
http://doc.scrapy.org/en/latest/topics/selectors.html
http://doc.scrapy.org/en/latest/topics/firefox.html

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy: Scraping very select URLs - python

Related

Scrape multilevel menu using Scrapy 1.5

python scrapy code prints out the file I am reading from

Scrapy correct xpath for unnamed div with image and text

Scraping HTML into CSV

Advice extracting //td text and numbers

Categories

Resources