Not able to follow links in Scrapy - Python

I'm just starting with Scrapy, and I've figured out how to extract the content I want from a sports page (the name and team of a soccer player). Now I need to follow links in search of more teams; every team page has a link to its players page. The URL structure of the website is:
team page: http://esporte.uol.com.br/futebol/clubes/vitoria/
players page: http://esporte.uol.com.br/futebol/clubes/vitoria/jogadores/
I've read some Scrapy tutorials, and my thinking is that on the team pages I should follow links and not parse anything, while on the players pages I should not follow links and just parse the players. I don't know whether the idea is right and only the syntax is wrong, or whether my idea of following links is wrong altogether. Any help is welcome.
Here is my code:
class MoneyballSpider(BaseSpider):
    name = "moneyball"
    allowed_domains = ["esporte.uol.com.br", "click.uol.com.br", "uol.com.br"]
    start_urls = ["http://esporte.uol.com.br/futebol/clubes/vitoria/jogadores/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*/', ), deny=(r'.*futebol/clubes/.*/jogadores/', )), follow=True),
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*/jogadores/', )), callback='parse', follow=True),
    )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        jogadores = hxs.select('//div[@id="jogadores"]/div/ul/li')
        items = []
        for jogador in jogadores:
            item = JogadorItem()
            item['nome'] = jogador.select('h5/a/text()').extract()
            item['time'] = hxs.select('//div[@class="header clube"]/h1/a/text()').extract()
            items.append(item)
            print item['nome'], item['time']
        return items

First, since you need to follow and extract links, you need a CrawlSpider instead of a BaseSpider. Then, you need to define two rules: one for the players pages with a callback, and one for the team pages without a callback, just to follow. Also, you should start with a URL that lists the teams, like http://esporte.uol.com.br/futebol. Here's a complete spider that returns players from different teams:
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector


class JogadorItem(Item):
    nome = Field()
    time = Field()


class MoneyballSpider(CrawlSpider):
    name = "moneyball"
    allowed_domains = ["esporte.uol.com.br", "click.uol.com.br", "uol.com.br"]
    start_urls = ["http://esporte.uol.com.br/futebol"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*?/jogadores/', )), callback='parse_players', follow=True),
        Rule(SgmlLinkExtractor(allow=(r'.*futebol/clubes/.*', )), follow=True),
    )

    def parse_players(self, response):
        hxs = HtmlXPathSelector(response)
        jogadores = hxs.select('//div[@id="jogadores"]/div/ul/li')
        items = []
        for jogador in jogadores:
            item = JogadorItem()
            item['nome'] = jogador.select('h5/a/text()').extract()
            item['time'] = hxs.select('//div[@class="header clube"]/h1/a/text()').extract()
            items.append(item)
            print item['nome'], item['time']
        return items
Quote from the output:
...
[u'Silva'] [u'Vila Nova-GO']
[u'Luizinho'] [u'Vila Nova-GO']
...
[u'Michel'] [u'Guarani']
[u'Wellyson'] [u'Guarani']
...
This is just a hint to get you going; you'll need to tweak the spider further, e.g. choose an appropriate start URL depending on your needs.
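You can run it the usual way with scrapy crawl moneyball and, if you need the results in a file, let the built-in feed exports write them out for you, e.g. scrapy crawl moneyball -o players.csv (players.csv is just an example file name).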
Hope that helps.

Related

Extract data from nested XPath

I am a newbie with XPath.
I want to extract every single title, body, link, and release date from this link.
Everything seems okay except the body. How do I extract each body from the nested XPath? Thanks in advance :)
Here is my source:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from thehack.items import ThehackItem


class MySpider(BaseSpider):
    name = "thehack"
    allowed_domains = ["thehackernews.com"]
    start_urls = ["http://thehackernews.com/search/label/mobile%20hacking"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('//article[@class="post item module"]')
        items = []
        for titles in titles:
            item = ThehackItem()
            item['title'] = titles.select('span/h2/a/text()').extract()
            item['link'] = titles.select('span/h2/a/@href').extract()
            item['body'] = titles.select('span/div/div/div/div/a/div/text()').extract()
            item['date'] = titles.select('span/div/span/text()').extract()
            items.append(item)
        return items
Can anybody fix the body block? It is only the body that fails.
Thanks in advance, mate.
Here is a picture of the inspected elements from the website:
I think you were struggling with the selectors, right? You should check the documentation for selectors; there's a lot of good information there. In this particular example, using CSS selectors, it would be something like:
class MySpider(scrapy.Spider):
    name = "thehack"
    allowed_domains = ["thehackernews.com"]
    start_urls = ["http://thehackernews.com/search/label/mobile%20hacking"]

    def parse(self, response):
        for article in response.css('article.post'):
            item = ThehackItem()
            item['title'] = article.css('.post-title>a::text').extract_first()
            item['link'] = article.css('.post-title>a::attr(href)').extract_first()
            item['body'] = ''.join(article.css('[id^=summary] *::text').extract()).strip()
            item['date'] = article.css('[itemprop="datePublished"]::attr(content)').extract_first()
            yield item
It would be a good exercise for you to change them to XPath selectors, and maybe also check out ItemLoaders; together they are very useful.
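For instance, a rough sketch of the same parse() converted to XPath with an ItemLoader could look like the following. The XPath expressions are my guesses mirroring the CSS selectors above, not verified against the live page:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join, TakeFirst


class ArticleLoader(ItemLoader):
    # take the first match for single-valued fields, join the body text nodes
    default_output_processor = TakeFirst()
    body_out = Join('')


def parse(self, response):
    for article in response.xpath('//article[contains(@class, "post")]'):
        loader = ArticleLoader(item=ThehackItem(), selector=article)
        loader.add_xpath('title', './/*[contains(@class, "post-title")]/a/text()')
        loader.add_xpath('link', './/*[contains(@class, "post-title")]/a/@href')
        loader.add_xpath('body', './/*[starts-with(@id, "summary")]//text()')
        loader.add_xpath('date', './/*[@itemprop="datePublished"]/@content')
        yield loader.load_item()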

Scrapy: Do not crawl links on other domains' pages

Below is the spider I created to get all links on NecToday.com, for example.
import socket
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector


class PropertiesItem(scrapy.Item):
    # Primary fields
    title = scrapy.Field()
    url = scrapy.Field()


class NecSpider(CrawlSpider):
    name = "NecSpider"
    #allowed_domains = ["nectoday.com"]
    start_urls = ["http://nectoday.com"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//a',)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        print(response.url)
        item = PropertiesItem()
        item["title"] = response.xpath("//title/text()").extract()
        item["url"] = response.url
        return(item)
This code starts to fetch all the links present on the site. Some of the pages have YouTube links as well. The problem is that once the first YouTube link is crawled, it starts to crawl other YouTube links referenced from that first YouTube link.
I want to crawl the first YouTube link, but no others. YouTube is just an example; tomorrow it could be another site as well. How can this be achieved?
Why not try something along the lines of this:
start_urls = ["http://nectoday.com"]

def parse(self, response):
    # parse whatever you need
    for url in response.selector.xpath('//@href').extract():
        if 'youtube.com' in url:
            yield scrapy.Request(url, callback=self.parse_no_follow)
        else:
            yield scrapy.Request(url, callback=self.parse)

def parse_no_follow(self, response):
    # parse whatever you want and do not follow any more links
This will only be scraping from your allowed domain.
import html2text
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class QuotesSpider(CrawlSpider):
    name = "your app name"
    n = 0
    allowed_domains = ['domain']
    start_urls = ['anywebpage']

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        QuotesSpider.n = QuotesSpider.n + 1
        if len(response.body) > 100:
            h = html2text.HTML2Text()
            h.ignore_links = True
            h.ignore_images = True
            h.body_width = 0
            dd = response.body.decode("utf-8")
            init = dd.find("<p>")
            while init > 0:
                end = dd.find("</p>", init)
                if end > 0:
                    o = h.handle(dd[init:end + 4] + "\n")
                    supersentences = o.split('\n')
                    # assumed continuation: do something with supersentences,
                    # then advance to the next <p> so the loop terminates
                    init = dd.find("<p>", end)
                else:
                    break

Extracting data with Scrapy by looping through subpages

There is a page on my website that contains a list of staff members. Each staff member name links to their own individual pages.
I want to output a CSV file that lists each staff member's name and title, so the spider will need to loop through each of the links on the stafflist page, pulling the names and titles.
So far, this code only works to pull out the very last name and title on the list. The problem I'm having is making it go through each person's page to get a complete list.
How do I go about making this loop work?
class scrapeSpider(scrapy.Spider):
    name = "scrape"
    allowed_domains = ["example.com", "example.co.uk"]
    start_urls = [
        'http://example.com/stafflist/',
    ]

    def parse(self, response):
        for href in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), "span8")]//a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_SCRAPE)

    def parse_SCRAPE(self, response):
        items = []
        for sel in response.xpath('//div[contains(concat(" ",normalize-space(@class)," "), "span9")]'):
            item = scrapeItem()
            item['name'] = sel.xpath('h1/text()').extract()
            item['titles'] = sel.xpath('h2/text()').extract()
            print item['name'], item['titles']
            items.append(item)
        return items
Use a CrawlSpider, e.g.:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from myspider.items import PersonItem
from pyquery import PyQuery as pq  # PyQuery is awesome!
from urlparse import urlparse, parse_qs


class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['example.si']
    start_urls = ['http://example.com/stafflist/']

    rules = (
        # if you have a paginator, this Rule will extract the links
        Rule(LinkExtractor(
            restrict_xpaths=('//div[@class="paging"]//a[last()]')),
            follow=True),
        # restrict the crawler to look for links only inside restrict_xpaths
        # and then process those links with 'parse_item'
        Rule(LinkExtractor(
            restrict_xpaths=('//div[contains(concat(" ",normalize-space(@class)," "), "span8")]//a/@href')),
            callback='parse_item',
            follow=False),
    )

    def parse_item(self, response):
        """
        process persons page
        """
        self.response = response
        self.doc = pq(self.response.body)
        i = PersonItem()
        i["name"] = self.doc("h1").text()
        i["titles"] = self.doc("h2").text()
        ...
        return i
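If you'd rather not add the PyQuery dependency, roughly the same parse_item can be written with Scrapy's built-in selectors. A sketch, assuming the name and titles live in the page's h1/h2 exactly as in the PyQuery version:

def parse_item(self, response):
    """
    process persons page using Scrapy's own selectors
    """
    i = PersonItem()
    i["name"] = response.xpath('//h1/text()').extract_first()
    i["titles"] = response.xpath('//h2/text()').extract_first()
    return i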

Recursive Scraping Craigslist with Scrapy and Python 2.7

I'm having trouble getting the spider to follow the next page of ads without following every link it finds, eventually returning every craigslist page. I've played around with the rule as I know that's where the problem lies, but I either get just the first page, every page on craigslist, or nothing. Any help?
Here's my current code:
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request


class PageSpider(CrawlSpider):
    name = "cto"
    allowed_domains = ["medford.craigslist.org"]
    start_urls = ["http://medford.craigslist.org/cto/"]

    rules = (
        Rule(
            SgmlLinkExtractor(allow_domains=("medford.craigslist.org", )),
            callback='parse_page', follow=True
        ),
    )

    def parse_page(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//div[@class="content"]/p[@class="row"]')
        for row in rows:
            item = CraigslistSampleItem()
            link = row.xpath('.//span[@class="pl"]/a')
            item['title'] = link.xpath("text()").extract()
            item['link'] = link.xpath("@href").extract()
            item['price'] = row.xpath('.//span[@class="l2"]/span[@class="price"]/text()').extract()
            url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
You should specify the allow argument of SgmlLinkExtractor:
allow (a regular expression (or list of)) – a single regular expression (or list of regular expressions) that the (absolute) urls must match in order to be extracted. If not given (or empty), it will match all links.
rules = (
    Rule(SgmlLinkExtractor(allow='http://medford.craigslist.org/cto/'),
         callback='parse_page', follow=True),
)
This will keep the crawl to the links under the http://medford.craigslist.org/cto/ URL.
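If you are on a newer Scrapy version where SgmlLinkExtractor is deprecated, the equivalent rule with the generic LinkExtractor would look roughly like this (same idea, just the non-deprecated imports):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

rules = (
    Rule(LinkExtractor(allow=r'http://medford\.craigslist\.org/cto/'),
         callback='parse_page', follow=True),
)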
Hope that helps.

Scraping HTML into CSV

I want to extract content such as side effects, warnings, and dosage from the site mentioned in the start URLs. The CSV file is getting created, but nothing is displayed in it. The output is:
before for
[] # it is displaying empty list
after for
This is my code:
from scrapy.selector import Selector
from medicinelist_sample.items import MedicinelistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class MedSpider(CrawlSpider):
    name = "med"
    allowed_domains = ["medindia.net"]
    start_urls = ["http://www.medindia.net/doctors/drug_information/home.asp?alpha=z"]

    rules = [Rule(SgmlLinkExtractor(allow=('Zafirlukast.htm',)), callback="parse", follow=True), ]

    global Selector

    def parse(self, response):
        hxs = Selector(response)
        fullDesc = hxs.xpath('//div[@class="report-content"]//b/text()')
        final = fullDesc.extract()
        print "before for"  # this is just to see if it was printing
        print final
        print "after for"  # this is just to see if it was printing
Your Scrapy spider class's parse method should return item(s). With the current code, I do not see any items being returned. An example would be:
def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    sel = Selector(response)
    item = Item()
    item['id'] = sel.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = sel.xpath('//td[@id="item_name"]/text()').extract()
    item['description'] = sel.xpath('//td[@id="item_description"]/text()').extract()
    return item
For more information, take a look at the CrawlSpider example in the official scrapy docs.
Another problem in your code is that you are overriding the CrawlSpider's parse method to implement your callback logic. This must not be done with CrawlSpiders, since the parse method is part of their internal logic.
Ashish Nitin Patil has already noted this implicitly by naming his example function *parse_item*.
What the default implementation of a CrawlSpider's parse method basically does is call the callbacks you've specified in the rule definitions; so if you override it, your callbacks won't be called at all. See the Scrapy docs on crawling rules.
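Concretely, a minimal fix is to keep your rule but point it at a differently named callback; a sketch based on your existing rule (parse_drug_page is just an illustrative name):

rules = [Rule(SgmlLinkExtractor(allow=('Zafirlukast.htm',)), callback='parse_drug_page', follow=True), ]

def parse_drug_page(self, response):
    # move the body of your current parse() here unchanged, so that
    # CrawlSpider's built-in parse() keeps dispatching the rules
    hxs = Selector(response)
    final = hxs.xpath('//div[@class="report-content"]//b/text()').extract()
    print final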
I have just experimented a bit with the site you are crawling. Since you would like to extract some data about the medicine (like the name, indications, contraindications, etc.) from the different pages on this domain: wouldn't the following or similar XPath expressions fit your needs? I think your current query would give you just the "headers", but the actual info on this site is in the text nodes that follow those bold-rendered headers.
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field


class Medicine(Item):
    name = Field()
    dosage = Field()
    indications = Field()
    contraindications = Field()
    warnings = Field()


class TestmedSpider(CrawlSpider):
    name = 'testmed'
    allowed_domains = ['medindia.net']
    start_urls = ['http://www.medindia.net/doctors/drug_information/']

    rules = (
        Rule(SgmlLinkExtractor(allow=r'Zafirlukast.htm'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        drug_info = Medicine()
        selector = Selector(response)

        name = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Generic Name')]//..//following-sibling::text()[1])''')
        dosage = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Dosage')]//..//following-sibling::text()[1])''')
        indications = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Why it is prescribed (Indications)')]//..//following-sibling::text()[1])''')
        contraindications = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Contraindications')]//..//following-sibling::text()[1])''')
        warnings = selector.xpath(r'''normalize-space(//div[@class="report-content"]//b/text()[contains(., 'Warnings and Precautions')]//..//following-sibling::text()[1])''')

        drug_info['name'] = name.extract()
        drug_info['dosage'] = dosage.extract()
        drug_info['indications'] = indications.extract()
        drug_info['contraindications'] = contraindications.extract()
        drug_info['warnings'] = warnings.extract()

        return drug_info
This would give you the following info:
>scrapy parse --spider=testmed --verbose -d 2 -c parse_item --logfile C:\Python27\Scripts\Test\Test\spiders\test.log http://www.medindia.net/doctors/drug_information/Zafirlukast.htm

>>> DEPTH LEVEL: 1 <<<
# Scraped Items ------------------------------------------------------------
[{'contraindications': [u'Hypersensitivity.'],
  'dosage': [u'Adult- The recommended dose is 20 mg twice daily.'],
  'indications': [u'This medication is an oral leukotriene receptor antagonist (LTRA), prescribed for asthma. \xa0It blocks the action of certain natural substances that cause swelling and tightening of the airways.'],
  'name': [u'\xa0Zafirlukast'],
  'warnings': [u'Caution should be exercised in patients with history of liver disease, mental problems, suicidal thoughts, any allergy, elderly, during pregnancy and breastfeeding.']}]
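As a final note, since the original goal was a CSV file: once the spider returns proper items like the Medicine item above, Scrapy's built-in feed exports can write them out directly, e.g. scrapy crawl testmed -o output.csv (output.csv is just an example file name).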
