I'm new to Scrapy. When I run my code, the debug log reports no errors, but when I look at the amount of data it has scraped, that can't be right. Below is my code; I'm trying to get the reviews from TripAdvisor.
import HTMLParser
import unicodedata
import re
import time
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
class scrapingtestSpider(CrawlSpider):
    name = "scrapingtest"
    allowed_domains = ["tripadvisor.com"]
    base_uri = "http://www.tripadvisor.com"
    start_urls = [
        base_uri + "/RestaurantSearch?geo=60763&q=New+York+City%2C+New+York&cat=&pid="
    ]

htmlparser = HTMLParser.HTMLParser()

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

def clean_parsed_string(string):
    if len(string) > 0:
        ascii_string = string
        if is_ascii(ascii_string) == False:
            ascii_string = unicodedata.normalize('NFKD', ascii_string).encode('ascii', 'ignore')
        return str(ascii_string)
    else:
        return None

def get_parsed_string(selector, xpath):
    return_string = ''
    extracted_list = selector.xpath(xpath).extract()
    if len(extracted_list) > 0:
        raw_string = extracted_list[0].strip()
        if raw_string is not None:
            return_string = htmlparser.unescape(raw_string)
    return return_string

def get_parsed_string_multiple(selector, xpath):
    return_string = ''
    return selector.xpath(xpath).extract()

def parse(self, response):
    tripadvisor_items = []
    sel = Selector(response)
    snode_restaurants = sel.xpath('//div[@id="EATERY_SEARCH_RESULTS"]/div[starts-with(@class, "listing")]')
    # Build item index.
    for snode_restaurant in snode_restaurants:
        # Cleaning string and taking only the first part before whitespace.
        snode_restaurant_item_avg_stars = clean_parsed_string(get_parsed_string(snode_restaurant, 'div[@class="wrap"]/div[@class="entry wrap"]/div[@class="description"]/div[@class="wrap"]/div[@class="rs rating"]/span[starts-with(@class, "rate")]/img[@class="sprite-ratings"]/@alt'))
        tripadvisor_item['avg_stars'] = re.match(r'(\S+)', snode_restaurant_item_avg_stars).group()
        # Populate reviews and address for current item.
        yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_search_page)

def parse_fetch_review(self, response):
    tripadvisor_item = response.meta['tripadvisor_item']
    sel = Selector(response)
    counter_page_review = response.meta['counter_page_review']
    # TripAdvisor reviews for item.
    snode_reviews = sel.xpath('//div[@id="REVIEWS"]/div/div[contains(@class, "review")]/div[@class="col2of2"]/div[@class="innerBubble"]')
    # Reviews for item.
    for snode_review in snode_reviews:
        tripadvisor_review_item = ScrapingtestreviewItem()
        tripadvisor_review_item['title'] = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="quote"]/text()'))
        # Review item description is a list of strings.
        # Strings in list are generated parsing user intentional newline. DOM: <br>
        tripadvisor_review_item['description'] = get_parsed_string_multiple(snode_review, 'div[@class="entry"]/p/text()')
        # Cleaning string and taking only the first part before whitespace.
        snode_review_item_stars = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="rating reviewItemInline"]/span[starts-with(@class, "rate")]/img/@alt'))
        tripadvisor_review_item['stars'] = re.match(r'(\S+)', snode_review_item_stars).group()
        snode_review_item_date = clean_parsed_string(get_parsed_string(snode_review, 'div[@class="rating reviewItemInline"]/span[@class="ratingDate"]/text()'))
        snode_review_item_date = re.sub(r'Reviewed ', '', snode_review_item_date, flags=re.IGNORECASE)
        snode_review_item_date = time.strptime(snode_review_item_date, '%B %d, %Y') if snode_review_item_date else None
        tripadvisor_review_item['date'] = time.strftime('%Y-%m-%d', snode_review_item_date) if snode_review_item_date else None
        tripadvisor_item['reviews'].append(tripadvisor_review_item)
Here's the DEBUG log
C:\Users\smash_000\Desktop\scrapingtest\scrapingtest>scrapy crawl scrapingtest -o items.json
C:\Users\smash_000\Desktop\scrapingtest\scrapingtest\spiders\scrapingtest_spider.py:6: ScrapyDeprecationWarning: Module `scrapy.spider` is deprecated, use `scrapy.spiders` instead
  from scrapy.spider import BaseSpider
C:\Users\smash_000\Desktop\scrapingtest\scrapingtest\spiders\scrapingtest_spider.py:9: ScrapyDeprecationWarning: Module `scrapy.contrib.spiders` is deprecated, use `scrapy.spiders` instead
  from scrapy.contrib.spiders import CrawlSpider, Rule
2015-07-14 11:07:04 [scrapy] INFO: Scrapy 1.0.1 started (bot: scrapingtest)
2015-07-14 11:07:04 [scrapy] INFO: Optional features available: ssl, http11
2015-07-14 11:07:04 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'scrapingtest.spiders', 'FEED_FORMAT': 'json', 'SPIDER_MODULES': ['scrapingtest.spiders'], 'FEED_URI': 'items.json', 'BOT_NAME': 'scrapingtest'}
2015-07-14 11:07:04 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter, TelnetConsole, LogStats, CoreStats, SpiderState
2015-07-14 11:07:05 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-07-14 11:07:05 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-07-14 11:07:05 [scrapy] INFO: Enabled item pipelines:
2015-07-14 11:07:05 [scrapy] INFO: Spider opened
2015-07-14 11:07:05 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-07-14 11:07:05 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-07-14 11:07:06 [scrapy] DEBUG: Crawled (200) <GET http://www.tripadvisor.com/RestaurantSearch?geo=60763&q=New+York+City%2C+New+York&cat=&pid=> (referer: None)
2015-07-14 11:07:06 [scrapy] INFO: Closing spider (finished)
2015-07-14 11:07:06 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 281,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 46932,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 7, 14, 5, 37, 6, 929000),
'log_count/DEBUG': 2,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2015, 7, 14, 5, 37, 5, 474000)}
2015-07-14 11:07:06 [scrapy] INFO: Spider closed (finished)
Did you try to debug your code with print statements?
I tried to execute your parser. If I copy the provided code as is, I get the same result, because the spider class scrapingtestSpider has no parse method, so it never gets called.
If I do some formatting of your code (I indent everything under start_urls so that it is inside the class), I get errors because the helper methods are not reachable by their global names.
If I go further and keep only the two parse methods for the crawler, I get other errors saying that tripadvisor_item is not defined... So the code is not really working.
Format your code properly in your IDE and add print messages to your parse methods to see whether they get called or not. The main parse method should be entered when Scrapy crawls the first URL. I think it won't.
And by the way, the callback you pass to the Request is badly named too:
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_search_page)
should be changed to
yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)
when you fix the indentation problems.
And at the end of the parse_fetch_review method return or yield the tripadvisor_item you created in the parse method.
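To make the shape of those fixes concrete, here is a minimal, hedged sketch of how the corrected spider could be laid out. It is not the poster's real code: the RestaurantItem class, its fields, and the link xpath are assumptions. What matters is that parse() sits inside the class, the callback points at parse_fetch_review, and the item is yielded at the end so the feed exporter has something to write.

import scrapy
from scrapy.http import Request


class RestaurantItem(scrapy.Item):
    # hypothetical item class; the question never shows its item definitions
    url = scrapy.Field()
    avg_stars = scrapy.Field()
    reviews = scrapy.Field()


class scrapingtestSpider(scrapy.Spider):
    name = "scrapingtest"
    allowed_domains = ["tripadvisor.com"]
    base_uri = "http://www.tripadvisor.com"
    start_urls = [base_uri + "/RestaurantSearch?geo=60763&q=New+York+City%2C+New+York&cat=&pid="]

    def parse(self, response):
        # parse() is indented inside the class, so Scrapy actually calls it
        for restaurant in response.xpath('//div[@id="EATERY_SEARCH_RESULTS"]/div[starts-with(@class, "listing")]'):
            item = RestaurantItem(reviews=[])
            href = restaurant.xpath('.//a/@href').extract()  # the exact link xpath is an assumption
            if not href:
                continue
            item['url'] = self.base_uri + href[0]
            # the callback name matches the method that is actually defined below
            yield Request(item['url'], meta={'tripadvisor_item': item}, callback=self.parse_fetch_review)

    def parse_fetch_review(self, response):
        item = response.meta['tripadvisor_item']
        # ... fill item['reviews'] here, as in the question ...
        yield item  # without returning/yielding the item, the feed export stays empty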
Related
I would like to access and then extract the contents from a list of URLs. For instance, consider this website; I would like to extract the content of each post. So, based on the posted answers, I tried the following:
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
import urllib


class Test(scrapy.Spider):
    name = "test"
    allowed_domains = ["https://sfbay.craigslist.org/search/jjj?employment_type=2"]
    start_urls = (
        'https://sfbay.craigslist.org/search/jjj?employment_type=2',
    )

    def parse(self, response):
        driver = webdriver.Firefox()
        driver.get(response)
        links = driver.find_elements_by_xpath('''.//a[@class='hdrlnk']''')
        links = [x.get_attribute('href') for x in links]
        for x in links:
            print(x)
However, I do not understand how to scrape, in a single pass, all the content from a long list of links without specifying the target URLs. Any idea of how to do it? I also tried something similar to this video, and I am still stuck...
UPDATE
Based on @quasarseeker's answer, I tried:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from test.items import TestItems


class TestSpider(CrawlSpider):
    name = "test"
    allowed_domains = ["https://sfbay.craigslist.org/search/jjj?employment_type=2"]
    start_urls = (
        'https://sfbay.craigslist.org/search/jjj?employment_type=2',
    )
    rules = (
        # Rule to parse through all pages
        Rule(LinkExtractor(allow=(), restrict_xpaths=("//a[@class='button next']",)),
             follow=True),
        # Rule to parse through all listings on a page
        Rule(LinkExtractor(allow=(), restrict_xpaths=("/p[@class='row']/a",)),
             callback="parse_obj", follow=True),
    )

    def parse_obj(self, response):
        item = TestItem()
        item['url'] = []
        for link in LinkExtractor(allow=(), deny=self.allowed_domains).extract_links(response):
            item['url'].append(link.url)
        print('\n\n\n\n**********************\n\n\n\n', item)
        return item
However, I am not getting anything:
2016-11-03 08:46:24 [scrapy] INFO: Scrapy 1.2.0 started (bot: test)
2016-11-03 08:46:24 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'test.spiders', 'BOT_NAME': 'test', 'ROBOTSTXT_OBEY': True, 'SPIDER_MODULES': ['test.spiders']}
2016-11-03 08:46:24 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats', 'scrapy.extensions.corestats.CoreStats']
2016-11-03 08:46:24 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-11-03 08:46:24 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-11-03 08:46:24 [scrapy] INFO: Enabled item pipelines:
[]
2016-11-03 08:46:24 [scrapy] INFO: Spider opened
2016-11-03 08:46:24 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-11-03 08:46:24 [scrapy] DEBUG: Crawled (200) <GET https://sfbay.craigslist.org/robots.txt> (referer: None)
2016-11-03 08:46:25 [scrapy] DEBUG: Crawled (200) <GET https://sfbay.craigslist.org/search/jjj?employment_type=2> (referer: None)
2016-11-03 08:46:25 [scrapy] DEBUG: Filtered offsite request to 'sfbay.craigslist.org': <GET https://sfbay.craigslist.org/search/jjj?employment_type=2&s=100>
2016-11-03 08:46:25 [scrapy] INFO: Closing spider (finished)
2016-11-03 08:46:25 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 516,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 18481,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 11, 3, 14, 46, 25, 230629),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'offsite/domains': 1,
'offsite/filtered': 1,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 11, 3, 14, 46, 24, 258110)}
2016-11-03 08:46:25 [scrapy] INFO: Spider closed (finished)
I don't use Selenium (I use BeautifulSoup), so there may be a better solution.
You can get all a tags with class hdrlnk and then get the href from those tags. Then you have a list of all the links, and you can go to those pages and get their content.
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('https://sfbay.craigslist.org/search/jjj?employment_type=2')

# get all `a` with `class=hdrlnk`
links = driver.find_elements_by_xpath('.//a[@class="hdrlnk"]')
#links = driver.find_elements_by_css_selector('a.hdrlnk')
#links = driver.find_elements_by_class_name('hdrlnk')

# get all `href` from all `a`
links = [x.get_attribute('href') for x in links]

# visit pages
for x in links:
    print(x)

    # follow link
    driver.get(x)

    # ... here get page content ...

    # ... EDIT ...

    # ... using `elements` (with `s`) ...
    #content = driver.find_elements_by_xpath('.//*[@id="postingbody"]')
    #content = driver.find_elements_by_css_selector('#postingbody')
    content = driver.find_elements_by_id('postingbody')
    #print([x.text for x in content])
    #print([x.text for x in content][0])
    print(''.join([x.text for x in content]))

    # ... using `element` (without `s`) ...
    #content = driver.find_element_by_xpath('.//*[@id="postingbody"]')
    #content = driver.find_element_by_css_selector('#postingbody')
    content = driver.find_element_by_id('postingbody')
    print(content.text)
This can be done easily with Scrapy, but you'll need to modify your rules and point the LinkExtractor at the xpath of all the pages you wish to scrape. For the web page you provided in the example, it would look something like this:
rules = (Rule(LinkExtractor(allow=("//p[@class='row']/a")),
              callback='parse_obj', follow=True),)
This would enable the rules to parse through each listing which is contained in the xpath
/p[@class='row']/a
and then call upon parse_obj().
I also noticed that the listings page has pagination. In case you're looking to parse through every single page of the listing, you would need to include a rule to first parse through the pagination buttons, then through the links on each page, and then finally call your function. Your final code would look something like this:
rules = (
    # Rule to parse through all pages
    Rule(LinkExtractor(allow=(), restrict_xpaths=("//a[@class='button next']",)),
         follow=True),
    # Rule to parse through all listings on a page
    Rule(LinkExtractor(allow=(), restrict_xpaths=("//p[@class='row']/a",)),
         callback="parse_obj", follow=True),
)
I am following this topic to extract content from a website which requires authentication. I have two versions of the code; the first one is below.
class FoodCrawler(InitSpider):
    def parse(self, response):
        pass

    name = "theCrawler"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    login_page = 'http://example.com/login'

    def __init__(self, user, password, *args, **kwargs):
        super(FoodCrawler, self).__init__(*args, **kwargs)
        self.password = password
        self.user = user
        msg = 'The account will be used ' + user + ' ' + password
        self.log(msg, level=logging.INFO)

    def init_request(self):
        """This function is called before crawling starts."""
        msg = {'email': self.user, 'password': self.password,
               'reCaptchaResponse': '', 'rememberMe': 'true'}
        headers = {'X-Requested-With': 'XMLHttpRequest',
                   'Content-Type': 'application/json'}
        yield Request(self.login_page, method='POST', body=json.dumps(msg), headers=headers,
                      callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        if json.loads(response.body)['isSuccess']:
            self.log("Successfully logged in!")
            self.initialized(response)
        else:
            self.log("Bad times :(")

    def initialized(self, response=None):
        self.log("initialized")
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
In the second version, I only change the initialized function; the rest is the same:
def initialized(self, response=None):
    self.log("initialized")
The difference is that the first version's initialized may contain more logic, while the second one's doesn't; please see (*) below for details. To demonstrate, take a look at self.log("initialized"): I want to show that the first version doesn't work properly. When I run it, the first version never shows the message DEBUG: initialized from self.log("initialized"), while the second version does.
The full log yielded by the first version was:
2016-01-05 16:05:38 [scrapy] INFO: Scrapy 1.0.3 started (bot: MySpider)
2016-01-05 16:05:38 [scrapy] INFO: Optional features available: ssl, http11, boto
2016-01-05 16:05:38 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'MySpider.spiders', 'SPIDER_MODULES': ['MySpider.spiders'], 'CONCURRENT_REQUESTS': 4, 'BOT_NAME': 'MySpider'}
2016-01-05 16:05:39 [scrapy] INFO: Enabled extensions: CloseSpider, TelnetConsole, LogStats, CoreStats, SpiderState
2016-01-05 16:05:39 [theCrawler] INFO: The account will be used username@gmail.com 123456789
2016-01-05 16:05:39 [py.warnings] WARNING: /usr/lib/python2.7/site-packages/scrapy/utils/deprecate.py:155: ScrapyDeprecationWarning: `scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware` class is deprecated, use `scrapy.downloadermiddlewares.useragent.UserAgentMiddleware` instead
ScrapyDeprecationWarning)
2016-01-05 16:05:39 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, RotateUserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2016-01-05 16:05:39 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2016-01-05 16:05:39 [scrapy] INFO: Enabled item pipelines:
2016-01-05 16:05:39 [scrapy] INFO: Spider opened
2016-01-05 16:05:39 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-01-05 16:05:39 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-01-05 16:05:39 [scrapy] DEBUG: Crawled (200) <POST http://www.example.com/login> (referer: None)
2016-01-05 16:05:39 [theCrawler] DEBUG: Successfully logged in!
2016-01-05 16:05:39 [scrapy] INFO: Closing spider (finished)
2016-01-05 16:05:39 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 494,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 1187,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 1, 5, 9, 5, 39, 363402),
'log_count/DEBUG': 3,
'log_count/INFO': 8,
'log_count/WARNING': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2016, 1, 5, 9, 5, 39, 168955)}
2016-01-05 16:05:39 [scrapy] INFO: Spider closed (finished)
I would like to know why; could you please give any advice? Thank you in advance.
[Updated]
import json, pdb, logging
from scrapy import Request
from scrapy.spiders.init import InitSpider
(*) The initialized function contains more calls, such as self.my_requests(), but that function doesn't work. Indeed, the script never enters self.my_requests():
def initialized(self, response=None):
    self.log("initialized")
    self.my_requests()

def my_requests(self):
    self.log("my_requests")
    pdb.set_trace()
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
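One plain-Python detail that may explain the symptom: a function that contains yield is a generator function, so simply calling it (as check_login_response does with self.initialized(response), or as initialized does with self.my_requests()) only creates a generator object; its body never runs unless the generator is iterated. A small standalone illustration, outside Scrapy:

# Standalone illustration (not Scrapy): why a method containing `yield` can
# appear to "not run" when it is called like a normal function.
def plain_version():
    print("initialized")        # executes as soon as the function is called

def generator_version():
    print("initialized")        # executes only when the generator is iterated
    yield "a request"

plain_version()                 # prints: initialized
gen = generator_version()       # prints nothing; only a generator object is created
list(gen)                       # now the body runs and "initialized" is printed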
I've been getting blank JSON files despite successfully being able to execute most of the lines in the scrapy shell.
When I run the command scrapy crawl courses with my courses bot being:
from scrapy.spiders import CrawlSpider
from scrapy.linkextractors import LinkExtractor
from tutorial.items import CoursesItem
from bs4 import BeautifulSoup
import scrapy


class CoursesSpider(CrawlSpider):
    name = 'courses'
    allowed_domains = ['guide.berkeley.edu']
    start_urls = ['http://guide.berkeley.edu/courses/ast',
                  ]

    def parse(self, response):
        soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
        items = []
        for course_info, course_desc, course_req in zip(soup.find_all('p', class_='courseblocktitle'),
                                                        soup.find_all('p', class_='courseblockdesc'),
                                                        soup.find_all('div', class_='course-section')):
            item = CoursesItem()
            item['title'] = course_info.text
            item['description'] = course_desc.text
            item['requirements'] = course_req.text
            yield items
and my settings.py being
BOT_NAME = 'courses'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0.3'
# ITEM_PIPELINES = {
# 'tutorial.pipelines.JsonExportPipeline': 300
# }
FEED_URI = 'output.json'
FEED_FORMAT = 'json'
As you can see in the commented section, I've also tried making a pipeline.
My pipeline file looks like this:
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_spider.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
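One note on the commented-out block in settings.py above: a pipeline's process_item() is only called if the pipeline is registered under ITEM_PIPELINES, so with that block commented out the class above never runs. Enabling it would look like this (the 300 is just an arbitrary priority):

# settings.py -- register the pipeline so Scrapy actually calls it
ITEM_PIPELINES = {
    'tutorial.pipelines.JsonExportPipeline': 300,
}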
But I feel that might not be where the error lies, although it's possible, since I largely followed a couple of tutorials I found.
I used BeautifulSoup to simplify the way I select the items.
Last but not least, the terminal looks like this after I run it.
2015-08-07 23:58:44 [scrapy] INFO: Scrapy 1.0.1 started (bot: courses)
2015-08-07 23:58:44 [scrapy] INFO: Optional features available: ssl, http11
2015-08-07 23:58:44 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'tutorial.spiders', 'FEED_URI': 'output.json', 'SPIDER_MODULES': ['tutorial.spiders'], 'BOT_NAME': 'courses', 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0.3', 'FEED_FORMAT': 'json'}
2015-08-07 23:58:44 [scrapy] INFO: Enabled extensions: CloseSpider, FeedExporter, TelnetConsole, LogStats, CoreStats, SpiderState
2015-08-07 23:58:44 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-08-07 23:58:44 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-08-07 23:58:44 [scrapy] INFO: Enabled item pipelines:
2015-08-07 23:58:44 [scrapy] INFO: Spider opened
2015-08-07 23:58:44 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-08-07 23:58:44 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6024
2015-08-07 23:58:45 [scrapy] DEBUG: Redirecting (301) to <GET http://guide.berkeley.edu/courses/ast/> from <GET http://guide.berkeley.edu/courses/ast>
2015-08-07 23:58:45 [scrapy] DEBUG: Crawled (200) <GET http://guide.berkeley.edu/courses/ast/> (referer: None)
2015-08-07 23:58:45 [scrapy] INFO: Closing spider (finished)
2015-08-07 23:58:45 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 537,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 22109,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 8, 8, 6, 58, 45, 600000),
'log_count/DEBUG': 3,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2015, 8, 8, 6, 58, 44, 663000)}
2015-08-07 23:58:45 [scrapy] INFO: Spider closed (finished)
I've run through most of my options thoroughly. Running the single option --parse tells me that I'm off in parsing the items, but even then, I'd like to know where to go beyond the parse bug fix (i.e. outputting to JSON). Ultimately, I want to pipe all this data into a database.
I know it's a lot to look through, but any help is appreciated, thanks!
You wrote the wrong word. In the parse function, change items -> item.
def parse(self, response):
    soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
    items = []
    for ...
        item = CoursesItem()
        item['title'] = course_info.text
        item['description'] = course_desc.text
        item['requirements'] = course_req.text
        yield items # -> item
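In other words, a hedged sketch of the corrected loop, with only the last line changed:

def parse(self, response):
    soup = BeautifulSoup(response.body_as_unicode(), 'lxml')
    for course_info, course_desc, course_req in zip(soup.find_all('p', class_='courseblocktitle'),
                                                    soup.find_all('p', class_='courseblockdesc'),
                                                    soup.find_all('div', class_='course-section')):
        item = CoursesItem()
        item['title'] = course_info.text
        item['description'] = course_desc.text
        item['requirements'] = course_req.text
        yield item  # yield the single CoursesItem, not the unused `items` list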
I have what I think is a correct implementation of a spider that overrides two built-in functions:
parse_start_url() and parse()
When I run the spider with the custom overridden parse() function commented out, the spider runs fine and aggregates links using SgmlLinkExtractor; all OK.
But when I uncomment the custom parse() function, the spider runs without error, yet there is no output, so it must be the handling of requests and responses between the functions.
I have spent a few too many hours trying to get this to work, using different approaches: overriding functions, InitSpider/BaseSpider structures, etc. Nothing ever seems to set the cookies correctly.
I am on version 0.16.4 which is old, so perhaps there's an issue there?
* SOLVED *
Nevermind, I just solved it with a deep breath and a little bit of luck.
Revisited the approach of 'no middleware', using CrawlSpider, SgmlLinkExtractor() & overriding make_requests_from_url()
So I removed the block of code that was supposed to override parse(),
and added this:
def make_requests_from_url(self, url):
    request = Request(url, cookies={'somedomain.com.au+2': 'national'}, dont_filter=True)
    return request
SPIDER:
from scrapy.contrib.exporter import JsonItemExporter
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.shell import inspect_response
from scrapy.http.cookies import CookieJar
from TM.items import TMItem

import json
import time
import datetime
import re
import sys
import os

COOKIES_DEBUG = True
COOKIES_ENABLED = True

SPIDER_NAME = "TKComAuSpider"
SPIDER_VERSION = "1.0"


class TKComAuSpider(CrawlSpider):
    name = "TKComAuMusicSpecific"
    allowed_domains = ["domain.com.au"]

    global response_urls
    response_urls = []
    global site_section_category
    global master_items
    master_items = []

    start_urls = ["http://some.domain.com.au/shows/genre.aspx?c=2048"]

    rules = (Rule(SgmlLinkExtractor(allow=(".*page=[0-9]+.*", ),
                  restrict_xpaths=('//*[@id="ctl00_uiBodyMain_searchResultsControl_uiPaginateBottom_List"]/ul/li',)),
                  callback="parse_it", follow=True),
             )

    def parse(self, response):
        request_with_cookies = Request(url=self.start_urls[0], cookies={'domain.com.au+2': 'national'})
        print '\n\n' + request_with_cookies.url + '\n\n'
        yield request_with_cookies

    def parse_start_url(self, response):
        list(self.parse_it(response))

    def parse_it(self, response):
        spider_name = "TKComAuMusicSpecific"
        doc_date = datetime.datetime.now().strftime("%d-%m-%y-%H:%M")
        items = []
        hxs = HtmlXPathSelector(response)

        # RESPONSE ASSIGNMENT #
        response_url = response.url
        response_urls.append(response_url)

        # cl = response.headers.getlist('Cookie')
        # if cl:
        #     msg = "Sending cookies to: %s" % response_url + os.linesep
        #     msg += os.linesep.join("Cookie: %s" % c for c in cl)
        #     log.msg(msg, spider=spider, level=log.DEBUG)

        # CUSTOM SITE_SECTION TO CREATE SPIDER CAT FROM RESPONSE_URL #
        site_section_category = re.sub(r'^.*//[a-zA-Z0-9._-]+([^.?]+).*$', r'\1', response.url).title().replace('/', '')
        spider_category = "TKTerms" + site_section_category
        file_name = 'out/' + spider_category + ".out"

        with open("log/response.log", 'a') as l:
            l.write(doc_date + ' ' + ' spider: ' + spider_name + '\nresponse_url: ' + response_url
                    + '\nsite_section_category: ' + site_section_category
                    + '\nspider_category: ' + spider_category + '\n')

        f = open(file_name, 'w')
        for site in hxs.select('//*[@class="contentEvent"]'):
            link = site.select('h6/a/@href').extract()
            title = site.select('h6/a/text()').extract()
            f.write("%s\n" % title)
            master_items.append({"title": title[0], "item_type": spider_category})
            yield TMItem(title=title[0], item_type=spider_category)
        f.close()

        json_out = 'json/' + spider_name + '.json'
        f = open(json_out, 'w')
        final_json = (json.dumps({"docs": [{"spider_name": SPIDER_NAME, "spider_version": SPIDER_VERSION},
                                           {"doc_title": spider_name, "doc_date": doc_date,
                                            "urls": response_urls}, master_items]}))
        f.write(final_json)
        f.close()
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Enabled item pipelines: JsonWriterPipelineLines, JsonWriterPipeline
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] INFO: Spider opened
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6024
2014-04-30 13:15:46+1000 [scrapy] DEBUG: Web service listening on 0.0.0.0:6081
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] DEBUG: Redirecting (302) to <GET http://www.some.com.au/detection.aspx?rt=http%3a%2f%2fsome.domain.com.au%2fshows%2fgenre.aspx%3fc%3d2048> from <GET http://some.domain.com.au/shows/genre.aspx?c=2048>
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] DEBUG: Redirecting (302) to <GET http://some.domain.com.au/shows/genre.aspx?c=2048> from <GET http://www.some.com.au/detection.aspx?rt=http%3a%2f%2fsome.domain.com.au%2fshows%2fgenre.aspx%3fc%3d2048>
2014-04-30 13:15:46+1000 [TKComAuMusicSpecific] DEBUG: Crawled (200) <GET http://some.domain.com.au/shows/genre.aspx?c=2048> (referer: None)
http://some.domain.com.au/shows/genre.aspx?c=2048
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] DEBUG: Crawled (200) <GET http://some.domain.com.au/shows/genre.aspx?c=2048> (referer: http://some.domain.com.au/shows/genre.aspx?c=2048)
http://some.domain.com.au/shows/genre.aspx?c=2048
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] INFO: Closing spider (finished)
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1260,
'downloader/request_count': 4,
'downloader/request_method_count/GET': 4,
'downloader/response_bytes': 146364,
'downloader/response_count': 4,
'downloader/response_status_count/200': 2,
'downloader/response_status_count/302': 2,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 4, 30, 3, 15, 47, 108720),
'log_count/DEBUG': 10,
'log_count/INFO': 4,
'request_depth_max': 2,
'response_received_count': 2,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2014, 4, 30, 3, 15, 46, 220003)}
2014-04-30 13:15:47+1000 [TKComAuMusicSpecific] INFO: Spider closed (finished)
main file:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem
from scrapy.spider import BaseSpider


class BloggerxSpider(BaseSpider):
    name = 'bloggerx'
    allowed_domains = ['abcr.com']
    start_urls = ['http://www.abcr.com/profile/07372831905432746031']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = BloggerxItem()
        item['gender'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Gender")]/following-sibling::node()/text()').extract()
        item['blogger_since'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[2]/text()').re('\d+')
        item['profile_views'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[3]/text()').re('\d+')
        item['industry'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Industry")]/following-sibling::node()/span/a/text()').extract()
        item['occupation'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Occupation")]/following-sibling::node()/span/a/text()').extract()
        item['locality'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="locality"]/a/text()').extract()
        item['region'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="region"]/a/text()').extract()
        item['country'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="country-name"]/a/text()').extract()
        item['introduction'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Introduction")]/following-sibling::node()/text()').extract()
        item['interests'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Interests")]/following-sibling::node()/span/a/text()').extract()
        item['email1'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li/script/text()').re('[\w.]+@[\w.]+[com]')
        item['email2'] = hxs.select('/html/body/div[2]/div/div[2]/div/ul/li[3]/div/text()').extract()
        item['website'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li[2]/a/@href').extract()
        item['films'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Films")]/following-sibling::node()/span/a/text()').extract()
        item['music'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Music")]/following-sibling::node()/span/a/text()').extract()
        item['books'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Books")]/following-sibling::node()/span/a/text()').extract()
        item['blogs_follow'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/text()').extract()
        item['blogs_follow_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/@href').extract()
        item['author_blogs'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/text()').extract()
        item['author_blogs_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/@href').extract()
        return item
item file
from scrapy.item import Item, Field


class BloggerxItem(Item):
    # define the fields for your item here like:
    # name = Field()
    gender = Field()
    blogger_since = Field()
    profile_views = Field()
    industry = Field()
    occupation = Field()
    locality = Field()
    introduction = Field()
    interests = Field()
    email1 = Field()
    website = Field()
    films = Field()
    music = Field()
    books = Field()
    region = Field()
    country = Field()
    email2 = Field()
    blogs_follow = Field()
    blogs_follow_link = Field()
    author_blogs = Field()
    author_blogs_link = Field()
    pass
output when I run : scrapy crawl bloggerx -o items.json -t json
2013-03-07 16:39:24+0530 [scrapy] INFO: Scrapy 0.16.4 started (bot: bloggerx)
2013-03-07 16:39:24+0530 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, RedirectMiddleware, CookiesMiddleware, HttpCompressionMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Enabled item pipelines:
2013-03-07 16:39:25+0530 [bloggerx] INFO: Spider opened
2013-03-07 16:39:25+0530 [bloggerx] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6028
2013-03-07 16:39:25+0530 [scrapy] DEBUG: Web service listening on 0.0.0.0:6085
2013-03-07 16:39:27+0530 [bloggerx] DEBUG: Crawled (200) <GET http://www.abcr.com/profile/07372831905432746031> (referer: None)
2013-03-07 16:39:27+0530 [bloggerx] INFO: Closing spider (finished)
2013-03-07 16:39:27+0530 [bloggerx] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 249,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 13459,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 3, 7, 11, 9, 27, 320389),
'log_count/DEBUG': 7,
'log_count/INFO': 4,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2013, 3, 7, 11, 9, 25, 967450)}
2013-03-07 16:39:27+0530 [bloggerx] INFO: Spider closed (finished)
The generated output file is empty, and the individual hxs.select statements work fine when tried in the scrapy shell. Is there something silly I am doing?
It may be a bit late, but I have recently learnt Scrapy, and as far as my research goes...
You are importing CrawlSpider in the header but using BaseSpider.
error:
from scrapy.contrib.spiders import CrawlSpider, Rule
class BloggerxSpider(BaseSpider):
after correction:
from scrapy.contrib.spiders import **CrawlSpider**, Rule
class BloggerxSpider(**CrawlSpider**):
OR
from scrapy.spider import BaseSpider
class BloggerxSpider(BaseSpider):
Instead of def parse_blogger you need to put def parse.
def parse is the default method the framework uses for parsing; if you want to name it something different,
you need to send your responses to that new callback.
To use your own parse method you need to set it as the callback; this is an example for when you create your own Request:
request = Request("http://something", callback=self.parse_blogger)
If you're not explicitly defining rules and don't care about following links, then use a BaseSpider instead, but keep your callback named as parse.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem
class BloggerxSpider(BaseSpider):
...
Note that for CrawlSpiders, the documentation explicitly states that you should not name your callback parse, as that will override CrawlSpider's parse method and the spider will not crawl correctly.
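To make that contrast concrete, here is a hedged sketch of the CrawlSpider variant (the spider name and the Rule's allow pattern are made up for illustration): the extraction callback gets its own name so that CrawlSpider's built-in parse() is left alone to drive the rules.

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem


class BloggerxCrawlSpider(CrawlSpider):
    name = 'bloggerx_crawl'
    allowed_domains = ['blogger.com']
    start_urls = ['http://www.blogger.com/profile/07372831905432746031']

    rules = (
        # follow profile links and send each response to parse_profile, NOT parse
        Rule(SgmlLinkExtractor(allow=(r'/profile/\d+',)), callback='parse_profile', follow=True),
    )

    def parse_profile(self, response):
        # naming this method "parse" would override CrawlSpider.parse and break link following
        hxs = HtmlXPathSelector(response)
        item = BloggerxItem()
        item['profile_views'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[3]/text()').re('\d+')
        return item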
Your log output seems weird to me: there is no entry for your start_urls; the server responds to it with a 404, which Scrapy ignores by default, so no Items will be returned. Also, your spider does not declare BaseSpider, which means this code will not even compile, so it seems there are some copy/paste issues going on here.
EDIT------------------
I changed the domain to blogger.com and now it returns one Item:
2013-03-08 09:02:28-0600 [scrapy] INFO: Scrapy 0.17.0 started (bot: oneoff)
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'oneoff.spiders', 'SPIDER_MODULES': ['oneoff.spiders'], 'USER_AGENT': 'Chromium OneOff 24.0.1312.56 Ubuntu 12.04 (24.0.1312.56-0ubuntu0.12.04.1)', 'BOT_NAME': 'oneoff'}
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Enabled item pipelines:
2013-03-08 09:02:28-0600 [bloggerx] INFO: Spider opened
2013-03-08 09:02:28-0600 [bloggerx] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6024
2013-03-08 09:02:28-0600 [scrapy] DEBUG: Web service listening on 0.0.0.0:6081
2013-03-08 09:02:28-0600 [bloggerx] DEBUG: Crawled (200) <GET http://www.blogger.com/profile/07372831905432746031> (referer: None)
2013-03-08 09:02:28-0600 [bloggerx] DEBUG: Scraped from <200 http://www.blogger.com/profile/07372831905432746031>
{'author_blogs': [u'Inserire comunicati stampa per il turismo',
u'Inserire Comunicati stampa e Article Marketing',
u'Video Quacos'],
'author_blogs_link': [u'http://comunicati-stampa-per-il-turismo.blogspot.com/',
u'http://comunicati-stampa-vendita-online.blogspot.com/',
u'http://quacos.blogspot.com/'],
'blogger_since': [u'2008'],
'blogs_follow': [u'Abandonware Time',
u'AltroSeo.com',
u'ANSIMA notizie',
u'Cinnamon Girl',
u'enigmamigarun',
u'Fake Books - Libri di una riga.',
u'FM - COSMETICA E NON SOLO ',
u'GS BARBARIANS',
u'Il Disinformatico',
u"Linus' blog",
u'Montefeltro Nuoto Master',
u'Nella Tana del Coniglio',
u'PHP and tips'],
'blogs_follow_link': [u'http://squakenet.blogspot.com/',
u'http://www.altroseo.com/',
u'http://ansima.blogspot.com/',
u'http://cinnamongirl82.blogspot.com/',
u'http://enigmaamigarun.blogspot.com/',
u'http://fake-books.blogspot.com/',
u'http://valeriacosmeticafm.blogspot.com/',
u'http://gsbarbarians.blogspot.com/',
u'http://attivissimo.blogspot.com/',
u'http://torvalds-family.blogspot.com/',
u'http://montefeltronuotomaster.blogspot.com/',
u'http://anonimoconiglio.blogspot.com/',
u'http://phpntips.blogspot.com/'],
'books': [],
'country': [],
'email1': [u'bloggiovanni.cappellini@gmail.com'],
'email2': [u'cappogio#hotmail.com'],
'films': [],
'gender': [],
'industry': [],
'interests': [],
'introduction': [],
'locality': [],
'music': [],
'occupation': [],
'profile_views': [u'553'],
'region': [],
'website': [u'http://www.quacos.com']}
2013-03-08 09:02:28-0600 [bloggerx] INFO: Closing spider (finished)
2013-03-08 09:02:28-0600 [bloggerx] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 288,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 13615,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 3, 8, 15, 2, 28, 948533),
'item_scraped_count': 1,
'log_count/DEBUG': 9,
'log_count/INFO': 4,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2013, 3, 8, 15, 2, 28, 379242)}
2013-03-08 09:02:28-0600 [bloggerx] INFO: Spider closed (finished)
Spider:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from bloggerx.items import BloggerxItem


class BloggerxSpider(BaseSpider):
    name = 'bloggerx'
    allowed_domains = ['blogger.com']
    start_urls = ['http://www.blogger.com/profile/07372831905432746031']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = BloggerxItem()
        item['gender'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Gender")]/following-sibling::node()/text()').extract()
        item['blogger_since'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[2]/text()').re('\d+')
        item['profile_views'] = hxs.select('/html/body/div[2]/div/div[2]/div/p[3]/text()').re('\d+')
        item['industry'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Industry")]/following-sibling::node()/span/a/text()').extract()
        item['occupation'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Occupation")]/following-sibling::node()/span/a/text()').extract()
        item['locality'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="locality"]/a/text()').extract()
        item['region'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="region"]/a/text()').extract()
        item['country'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Location")]/following-sibling::node()/span[@class="country-name"]/a/text()').extract()
        item['introduction'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Introduction")]/following-sibling::node()/text()').extract()
        item['interests'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Interests")]/following-sibling::node()/span/a/text()').extract()
        item['email1'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li/script/text()').re('[\w.]+@[\w.]+[com]')
        item['email2'] = hxs.select('/html/body/div[2]/div/div[2]/div/ul/li[3]/div/text()').extract()
        item['website'] = hxs.select('//html/body/div[2]/div/div[2]/div/ul/li[2]/a/@href').extract()
        item['films'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Films")]/following-sibling::node()/span/a/text()').extract()
        item['music'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Music")]/following-sibling::node()/span/a/text()').extract()
        item['books'] = hxs.select('//html/body/div[2]/div/div[3]/table/tr/th[contains(text(),"Favourite Books")]/following-sibling::node()/span/a/text()').extract()
        item['blogs_follow'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/text()').extract()
        item['blogs_follow_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul[2]/li/a/@href').extract()
        item['author_blogs'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/text()').extract()
        item['author_blogs_link'] = hxs.select('//html/body/div[2]/div/div[3]/ul/li/span/a/@href').extract()
        return item