Scraping HTML into CSV - python

I want to extract content such as side effects, warnings, and dosage from the site given in the start URLs. The following is my code. The CSV file is getting created, but nothing shows up in it. The output is:
before for
[] # it is displaying empty list
after for
This is my code:
from scrapy.selector import Selector
from medicinelist_sample.items import MedicinelistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class MedSpider(CrawlSpider):
    name = "med"
    allowed_domains = ["medindia.net"]
    start_urls = ["http://www.medindia.net/doctors/drug_information/home.asp?alpha=z"]
    rules = [Rule(SgmlLinkExtractor(allow=('Zafirlukast.htm',)), callback="parse", follow=True),]

    global Selector

    def parse(self, response):
        hxs = Selector(response)
        fullDesc = hxs.xpath('//div[@class="report-content"]//b/text()')
        final = fullDesc.extract()
        print "before for"  # this is just to see if it was printing
        print final
        print "after for"   # this is just to see if it was printing

Your Scrapy spider class's parse method should return item(s). With the current code, I do not see any item being returned. An example would be:
def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    sel = Selector(response)
    item = Item()
    item['id'] = sel.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = sel.xpath('//td[@id="item_name"]/text()').extract()
    item['description'] = sel.xpath('//td[@id="item_description"]/text()').extract()
    return item
For more information, take a look at the CrawlSpider example in the official scrapy docs.

Another problem in your code is that you are overriding the CrawlSpider's parse method to implement your callback logic. This must not be done with CrawlSpiders, since the parse method is used internally in their crawling logic.
Ashish Nitin Patil has implicitly noted that already by naming his example function *parse_item*.
What the default implementation of a CrawlSpider's parse method basically does is call the callbacks that you've specified in the rule definitions; so if you override it, I think your callbacks won't be called at all. See Scrapy Doc - crawling rules.
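A minimal sketch of that fix applied to the spider in the question (only the rule and the renamed callback change; the XPath is the one from the original code):

rules = [
    # Point the rule at a callback that is NOT named "parse",
    # so CrawlSpider's own parse method stays intact.
    Rule(SgmlLinkExtractor(allow=('Zafirlukast.htm',)), callback='parse_item', follow=True),
]

def parse_item(self, response):
    hxs = Selector(response)
    final = hxs.xpath('//div[@class="report-content"]//b/text()').extract()
    self.log('Extracted %d bold headers from %s' % (len(final), response.url))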

I have just experimented a bit with the site that you are crawling. Since you would like to extract some data about the medicine (like the name, indications, contraindications, etc.) out of the different pages on this domain: wouldn't the following or similar XPath expressions fit your needs? I think your current query gives you just the "headers", but the actual info on this site is in the text nodes that follow those bold-rendered headers.
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from Test.items import TestItem
from scrapy.item import Item, Field

class Medicine(Item):
    name = Field()
    dosage = Field()
    indications = Field()
    contraindications = Field()
    warnings = Field()

class TestmedSpider(CrawlSpider):
    name = 'testmed'
    allowed_domains = ['medindia.net']
    start_urls = ['http://www.medindia.net/doctors/drug_information/']
    rules = (
        Rule(SgmlLinkExtractor(allow=r'Zafirlukast.htm'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        drug_info = Medicine()
        selector = Selector(response)
        name = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Generic Name')]//..//following-sibling::text()[1])''')
        dosage = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Dosage')]//..//following-sibling::text()[1])''')
        indications = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Why it is prescribed (Indications)')]//..//following-sibling::text()[1])''')
        contraindications = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Contraindications')]//..//following-sibling::text()[1])''')
        warnings = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Warnings and Precautions')]//..//following-sibling::text()[1])''')
        drug_info['name'] = name.extract()
        drug_info['dosage'] = dosage.extract()
        drug_info['indications'] = indications.extract()
        drug_info['contraindications'] = contraindications.extract()
        drug_info['warnings'] = warnings.extract()
        return drug_info
This would give you the following info:
>scrapy parse --spider=testmed --verbose -d 2 -c parse_item --logfile C:\Python27\Scripts\Test\Test\spiders\test.log http://www.medindia.net/doctors/drug_information/Zafirlukast.htm
>>> DEPTH LEVEL: 1 <<<
# Scraped Items ------------------------------------------------------------
[{'contraindications': [u'Hypersensitivity.'],
  'dosage': [u'Adult- The recommended dose is 20 mg twice daily.'],
  'indications': [u'This medication is an oral leukotriene receptor antagonist (LTRA), prescribed for asthma. \xa0It blocks the action of certain natural substances that cause swelling and tightening of the airways.'],
  'name': [u'\xa0Zafirlukast'],
  'warnings': [u'Caution should be exercised in patients with history of liver disease, mental problems, suicidal thoughts, any allergy, elderly, during pregnancy and breastfeeding.']}]

Related

Scrape multilevel menu using Scrapy 1.5

I am trying to get all links from a multilevel menu.
start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']
import scrapy
from foodisgood.items import FoodisgoodItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

class BbcSpider(CrawlSpider):
    name = 'bbc'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    rules = (
        Rule(LinkExtractor(allow=(r'/recipes/category/[\w-]+$'), restrict_xpaths='//article[contains(@class, "cleargridindent")]'), callback='parse_sub_categories', follow=True),
        Rule(LinkExtractor(allow=(r'/recipes/collection/[\w-]+$'), restrict_xpaths='//article[contains(@class, "cleargridindent")]'), callback='parse_collections', follow=True),
    )

    def parse_sub_categories(self, response):
        l = ItemLoader(item=FoodisgoodItem(), response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath('category_title', '//h1[@class="section-head--title"]/text()')
        l.add_value('page_url', response.url)
        yield l.load_item()

    def parse_collections(self, response):
        l = ItemLoader(item=FoodisgoodItem(), response=response)
        l.default_output_processor = TakeFirst()
        l.add_xpath('collection_title', '//h1[@class="section-head--title"]/text()')
        l.add_value('page_url', response.url)
        yield l.load_item()
Results of menu scraping
But I can't understand how to populate the empty first column before the collection title.
For now I have:
EMPTY | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
But I need:
Meat | Steak recipes | https://www.bbcgoodfood.com/recipes/collection/steak
Can somebody advise me on what I need to do to get the result with the subcategory in the first column?
Thanks to everyone)
What you want is not really doable using a CrawlSpider's rules (at least not in a simple way).
The usual way to do this is documented in Passing additional data to callback functions.
You would extract the category in your first callback, and then create a new request passing this information in the meta dict.
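A rough sketch of that approach with a plain scrapy.Spider (the XPath for the menu links is illustrative, based on the markup hinted at in the question, and may need adjusting):

import scrapy

class BbcMenuSpider(scrapy.Spider):
    name = 'bbc_menu'
    allowed_domains = ['bbcgoodfood.com']
    start_urls = ['https://www.bbcgoodfood.com/recipes/category/ingredients']

    def parse(self, response):
        # First level: follow each sub-category link and remember its title in meta.
        for link in response.xpath('//article[contains(@class, "cleargridindent")]//a'):
            category = link.xpath('string(.)').extract_first()
            url = link.xpath('./@href').extract_first()
            if url:
                yield response.follow(url, callback=self.parse_collections,
                                      meta={'category_title': category})

    def parse_collections(self, response):
        # Second level: combine the remembered category with the page's own title.
        yield {
            'category_title': response.meta.get('category_title'),
            'collection_title': response.xpath('//h1[@class="section-head--title"]/text()').extract_first(),
            'page_url': response.url,
        }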

Scrapy: Scraping very select URLs

I am trying to scrape Yahoo stocks for a school project, but I have no idea how to go through pages whose URLs follow a very specific pattern. The goal is to iterate through each stock, where only a certain portion of the URL changes, like so:
Starting URL = ["https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"]
The next URL would be something like:
#Canadian Imperial(note the "CM"):
"https://ca.finance.yahoo.com/q/hp?s=CM.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
#Blackberry (note the "BB"):
"https://ca.finance.yahoo.com/q/hp?s=BB.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
etc...
In other words, the only thing that would change would be the characters between "hp?s=" and ".TO&a".
Wondering if this is possible or not. The ending portion of the URL must stay the same, as that is the page I need to get to. Unfortunately, there are no links within each page on Yahoo to go to other stocks.
If I could do this with Scrapy's Rules and SgmlLinkExtractor, that would be preferable.
Would appreciate any help!
Thanks!
Current Scrapy code:
from scrapy.spider import Spider
from scrapy.selector import Selector
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["ca.finance.yahoo.com"]
    start_urls = [
        "https://ca.finance.yahoo.com/q/hp?s=BMO.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
    ]
    rules = [
        Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"), follow=True)
    ]

    def parse(self, response):
        item = Website()
        item['name'] = response.xpath('//div[@class="title"]/h2/text()').extract()
        print item['name']
Make a rule to follow the links matching the pattern:
rules = [
    Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+&a=\d+&b=\d+&c=\d+&d=\d+&e=\d+&f=\d+&g=m"), follow=True)
]
Though, I am not sure that you need to check for all URL parameters here. Simplified version:
rules = [
    Rule(LinkExtractor(allow=r"/q/hp\?s=\w+\.\w+"), follow=True)
]
And, don't forget the imports:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
Here's an example of what I was talking about in the comments I left.
import urllib

company_symbol = ["ACGL", "AFSI", "AGII", "AGNC", "ANAT", "ARCP", "ASBC", "ASPS", "BANF", "BBCN", "BGCP", "BNCL", "BOKF", "BPOP", "BRKL", "CACC", "CATY", "CBOE", "CBSH", "CFFN", "CHFC", "CINF", "CME ", "COLB", "CVBF", "ERIE", "ESGR", "ETFC", "EWBC", "EZPW", "FCFS", "FCNC", "FFBC", "FFIN", "FITB", "FMBI", "FMER", "FNFG", "FNGN", "FSRV", "FULT", "GBCI", "GLPI", "GLRE", "HBAN", "HBHC", "HLSS", "HOMB", "IBKC", "IBKR", "IBOC", "IPCC", "ISBC", "KRNY", "LPLA", "MBFI", "MHLD", "MKTX", "MTGE", "NAVG", "NBTB", "NDAQ", "NFBK", "NPBC", "NTRS", "NWBI", "ORIT", "OZRK", "PACW", "PBCT", "PCH ", "PNFP", "PRAA", "PVTB", "ROIC", "SAFT", "SBNY", "SBRA", "SCBT", "SEIC", "SIGI", "SIVB", "SLM ", "STFC", "SUSQ", "TCBI", "TFSL", "TRMK", "TROW", "UBSI", "UMBF", "UMPQ", "VRTS", "WABC", "WAFD", "WETF", "WRLD", "WTFC", "Z", "ZION"]

for company in company_symbol:
    # Build a per-symbol quote URL and fetch it.
    url = 'http://finance.google.com/finance/info?client=ig&q={0}:{1}'.format(company, 'NASDAQ')
    nasdaq = urllib.urlopen(url)
    text = nasdaq.read()
    # Append each response to a single output file.
    filename = 'nasdaq.txt'
    with open(filename, 'a') as output:
        output.write(str(text))
This code was written as an example of one way to change URLs and do something with each URL.
If you need to scrape only predefined quotes for a given period, then the logic is the following:
Prepare the list of quotes you are interested in, e.g. ['ABC', 'XYZ', 'LOL', ...].
Use a basic scrapy.Spider.
Define a start_requests() method and yield a sequence of requests from it.
Sample implementation:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["ca.finance.yahoo.com"]

    quotes = ["BMO", "CM", "BB"]
    url_template = ("https://ca.finance.yahoo.com/q/hp?s=%s.TO"
                    "&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m")

    def start_requests(self):
        for quote in self.quotes:
            url = self.url_template % quote
            yield Request(url)

    def parse(self, response):
        # process
        pass
But if you need to get data for ALL TSX quotes, then I would recommend scraping them from the available listings and then proceeding as in the example above. Crawling the entire ca.finance.yahoo.com is obviously a bad idea.
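A rough sketch of that two-step idea (the listings URL and the XPath below are hypothetical placeholders, not taken from the actual site):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class AllQuotesSpider(scrapy.Spider):
    name = "all_quotes"
    allowed_domains = ["ca.finance.yahoo.com"]
    # Hypothetical listings page that enumerates TSX symbols.
    start_urls = ["https://ca.finance.yahoo.com/tsx-listings"]
    url_template = ("https://ca.finance.yahoo.com/q/hp?s=%s.TO"
                    "&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m")

    def parse(self, response):
        # Hypothetical XPath: pull the ticker symbols out of the listing table.
        for symbol in response.xpath('//td[@class="symbol"]/a/text()').extract():
            yield Request(self.url_template % symbol.strip(), callback=self.parse_quote)

    def parse_quote(self, response):
        # process the historical-prices page here
        pass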
If you have a list of stocks you want to load the Yahoo page for, you can build the list of Yahoo URLs like this:
url_template = "https://ca.finance.yahoo.com/q/hp?s={}.TO&a=02&b=2&c=2005&d=02&e=2&f=2015&g=m"
stocks = ['CM', 'BB']
urls = [url_template.format(stock) for stock in stocks]
I haven't used scrapy, though, so I'm not sure if this is what you need.

How to scrape items from 2 different sections?

I'm new to Scrapy and web crawling, and I've been working on the page www.mercadolibre.com.mx. I have to get (from the start page) some data (description and prices) about the products displayed there. Here is my items.py:
from scrapy.item import Item, Field

class PruebaMercadolibreItem(Item):
    producto = Field()
    precio = Field()
And here is my spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from prueba_mercadolibre.items import PruebaMercadolibreItem

class MLSpider(BaseSpider):
    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='item-data']")
        items = []
        for titles in titles:
            item = PruebaMercadolibreItem()
            item["producto"] = titles.select("p[@class='tit le']/@title").extract()
            item["precio"] = titles.select("span[@class='ch-price']/text()").extract()
            items.append(item)
        return items
The problem is that I get the same results when I change this line:
titles = hxs.select("//div[@class='item-data']")
To this:
titles = hxs.select("//div[@class='item-data'] | //div[@class='item-data item-data-mp']")
And I'm not getting the same data as when I use the first line.
Can anyone help me? Do I have an error in my XPath selection?
Also, I can't find a good tutorial for using MySQL with Scrapy; I would appreciate any help. Thanks.
Better to use contains() if you want to get all div tags containing the item-data class:
titles = hxs.select("//div[contains(@class, 'item-data')]")
Also, you have other problems in the spider:
in the loop, you are overwriting the titles variable
the class name in the producto XPath should be title, not tit le
you probably don't want to have lists in the Field values; get the first items out of the extracted lists
HtmlXPathSelector is deprecated, use Selector instead
select() is deprecated, use xpath() instead
BaseSpider has been renamed to Spider
Here's the code with modifications:
from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.item import Item, Field
from prueba_mercadolibre.items import PruebaMercadolibreItem

class MLSpider(Spider):
    name = "mlspider"
    allowed_domains = ["mercadolibre.com"]
    start_urls = ["http://www.mercadolibre.com.mx"]

    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[contains(@class, 'item-data')]")
        for title in titles:
            item = PruebaMercadolibreItem()
            item["producto"] = title.xpath("p[@class='title']/@title").extract()[0]
            item["precio"] = title.xpath("span[@class='ch-price']/text()").extract()[0]
            yield item
Example items from the output:
{'precio': u'$ 35,000', 'producto': u'Cuatrimoto, Utv De 500cc 4x4 ,moto , Motos, Atv ,'}
{'precio': u'$ 695', 'producto': u'Reloj Esp\xeda Camara Oculta Video Hd 16 Gb! Sony Compara.'}

Not able to follow links in Scrapy

I'm just getting started with Scrapy, and I've figured out how to take the content I wanted from a sports page (name and team of a soccer player), but I need to follow links searching for more teams. Every team page has a link to the players page; the structure of the website's links is:
team page: http://esporte.uol.com.br/futebol/clubes/vitoria/
players page: http://esporte.uol.com.br/futebol/clubes/vitoria/jogadores/
I've read some Scrapy tutorials, and I'm thinking that for the team pages I have to follow links and not parse anything, and for the players pages I have to not follow and parse the players. I don't know if I'm right with this idea and just wrong with the syntax, or if my idea of following is wrong altogether; any help is welcome.
here is my code:
class MoneyballSpider(BaseSpider):
    name = "moneyball"
    allowed_domains = ["esporte.uol.com.br", "click.uol.com.br", "uol.com.br"]
    start_urls = ["http://esporte.uol.com.br/futebol/clubes/vitoria/jogadores/"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*/', ), deny=(r'.*futebol/clubes/.*/jogadores/', )), follow=True),
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*/jogadores/', )), callback='parse', follow=True),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        jogadores = hxs.select('//div[@id="jogadores"]/div/ul/li')
        items = []
        for jogador in jogadores:
            item = JogadorItem()
            item['nome'] = jogador.select('h5/a/text()').extract()
            item['time'] = hxs.select('//div[@class="header clube"]/h1/a/text()').extract()
            items.append(item)
            print item['nome'], item['time']
        return items
First, since you need to follow and extract links, you need a CrawlSpider instead of a BaseSpider. Then, you need to define two rules: one for players with a callback, and one for teams without one, to follow. Also, you should start with a URL that lists the teams, like http://esporte.uol.com.br/futebol. Here's a complete spider that returns players from different teams:
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector

class JogadorItem(Item):
    nome = Field()
    time = Field()

class MoneyballSpider(CrawlSpider):
    name = "moneyball"
    allowed_domains = ["esporte.uol.com.br", "click.uol.com.br", "uol.com.br"]
    start_urls = ["http://esporte.uol.com.br/futebol"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*?/jogadores/', )), callback='parse_players', follow=True),
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*', )), follow=True),
    )

    def parse_players(self, response):
        hxs = HtmlXPathSelector(response)
        jogadores = hxs.select('//div[@id="jogadores"]/div/ul/li')
        items = []
        for jogador in jogadores:
            item = JogadorItem()
            item['nome'] = jogador.select('h5/a/text()').extract()
            item['time'] = hxs.select('//div[@class="header clube"]/h1/a/text()').extract()
            items.append(item)
            print item['nome'], item['time']
        return items
Quote from the output:
...
[u'Silva'] [u'Vila Nova-GO']
[u'Luizinho'] [u'Vila Nova-GO']
...
[u'Michel'] [u'Guarani']
[u'Wellyson'] [u'Guarani']
...
This is just a hint for you to continue working on the spider; you'll need to tweak it further: choose an appropriate start URL depending on your needs, etc.
Hope that helps.

Advice extracting //td text and numbers

I have been working through the tutorial, adapting it to a project I want to achieve. I seem to have something going wrong, and I just can't find the error.
When using scrapy shell I can get the response I expect. So for this site, NRL Ladder:
In [1]: hxs.select('//td').extract()
Out[1]:
[u'<td>\r\n<div id="ls-nav">\r\n<ul><li><span>Home</span></li>\r\n<li class="ls-nav-on"><span>NRL</span></li>\r\n<li><span>NYC</span></li>\r\n<li><span>Rep Matches</span></li>\r\n\r\n</ul></div>\r\n</td>',
u'<td style="text-align:left" colspan="5">Round 4</td>',
u'<td colspan="5">Updated: 26/3/2012</td>',
u'<td style="text-align:left">1. Melbourne</td>',
u'<td>4</td>',
u'<td>4</td>',
u'<td>0</td>',
u'<td>0</td>',
u'<td>0</td>',
u'<td>122</td>',
u'<td>39</td>',
u'<td>83</td>',
u'<td>8</td>',
u'<td style="text-align:left">2. Canterbury-Bankstown</td>',
And on it goes.
I am really struggling to understand how to alter the tutorial project to change it to a different data type.
Is there any way to bring up a help or documentation list to see what types I should use in items when using 'td' or any other element? Like I say, it works easily in the shell, but I cannot transform it into the files. Specifically, both the team names and the points are in 'td' cells, but the team name is text.
here is what I have done.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from nrl.items import NrlItem

class nrl(BaseSpider):
    name = "nrl"
    allowed_domains = ["http://live.nrlstats.com/"]
    start_urls = [
        "http://live.nrlstats.com/nrl/ladder.html",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//td')
        items = []
        for site in sites:
            item = nrlItem()
            item['team'] = site.select('/text()').extract()
            item['points'] = site.select('/').extract()
            items.append(item)
        return items
I didn't quite understand your question, but here is a starting point, imo (haven't tested; see some comments in the code):
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from nrl.items import NrlItem

class nrl(BaseSpider):
    name = "nrl"
    allowed_domains = ["live.nrlstats.com"]  # domains should be like this
    start_urls = [
        "http://live.nrlstats.com/nrl/ladder.html",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="tabler"]//tr[starts-with(@class, "r")]')  # select team rows
        items = []
        for row in rows:
            item = NrlItem()
            columns = row.select('./td/text()').extract()  # select columns for the selected row
            item['team'] = columns[0]
            item['P'] = int(columns[1])
            item['W'] = int(columns[2])
            ...
            items.append(item)
        return items
UPDATE:
//table[@class="tabler"]//tr[starts-with(@class, "r")] is an XPath query. See some XPath examples here.
hxs.select(xpath_query) always returns a list of nodes (also of type HtmlXPathSelector) which fall under the given query.
hxs.extract() returns a string representation of the node(s).
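For instance, a quick check of those two calls in scrapy shell against the ladder page might look like this (a sketch of the old HtmlXPathSelector API, not actual captured output):

rows = hxs.select('//table[@class="tabler"]//tr[starts-with(@class, "r")]')  # list of selector nodes
first_row_cells = rows[0].select('./td/text()').extract()                    # list of unicode strings
team_name = first_row_cells[0]                                               # e.g. the team-name column text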
P.S. Beware that Scrapy supports XPath 1.0, but not 2.0 (at least on Linux, not sure about Windows), so some of the newest XPath features might not work.
See also:
http://doc.scrapy.org/en/latest/topics/selectors.html
http://doc.scrapy.org/en/latest/topics/firefox.html
