Python Scrapy to extract specific XPath fields

I have the following structure (sample). I am using Scrapy to extract the details. I need to extract the 'href' fields and the link text, e.g. 'Accounting'. I am using the code below. I am new to XPath; any help extracting these specific fields would be appreciated.
<div class='something'>
    <ul>
        <li>Accounting</li>
        <li>Administrative</li>
        <li>Advertising</li>
        <li>Airline</li>
    </ul>
</div>
My code is:
from scrapy.spider import BaseSpider
from jobfetch.items import JobfetchItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose


class JobFetchSpider(BaseSpider):
    """Spider for regularly updated livingsocial.com site, San Francisco Page"""
    name = "Jobsearch"
    allowed_domains = ["jobsearch.about.com/"]
    start_urls = ['http://jobsearch.about.com/od/job-titles/fl/job-titles-a-z.htm']

    def parse(self, response):
        count = 0
        for sel in response.xpath('//*[@id="main"]/div/div[2]/div[1]/div/div[2]/article/div[2]/ul[1]'):
            item = JobfetchItem()
            item['title'] = sel.extract()
            item['link'] = sel.extract()
            count = count + 1
            print item
        yield item

The problems you have in the code:

- yield item should be inside the loop, since you are instantiating items there
- the XPath you have is pretty messy and not reliable, since it heavily relies on the elements' location inside their parent tags and starts from almost the top parent of the document
- your XPath is incorrect - it should go down to the a elements inside li inside ul
- sel.extract() would only give you that ul element extracted
For the sake of an example, use a CSS selector here to get to the a elements inside the li tags:
import scrapy

from jobfetch.items import JobfetchItem


class JobFetchSpider(scrapy.Spider):
    name = "Jobsearch"
    allowed_domains = ["jobsearch.about.com"]
    start_urls = ['http://jobsearch.about.com/od/job-titles/fl/job-titles-a-z.htm']

    def parse(self, response):
        for sel in response.css('article[itemprop="articleBody"] div.expert-content-text > ul > li > a'):
            item = JobfetchItem()
            item['title'] = sel.xpath('text()').extract()[0]
            item['link'] = sel.xpath('@href').extract()[0]
            yield item
Running the spider produces:
{'link': u'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm', 'title': u'Accounting'}
{'link': u'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm', 'title': u'Administrative'}
...
{'link': u'http://jobsearch.about.com/od/job-title-samples/fl/yacht-job-titles.htm', 'title': u'Yacht Jobs'}
FYI, we could have used xpath() also:
//article[@itemprop="articleBody"]//div[@class="expert-content-text"]/ul/li/a
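For instance, here is a minimal sketch of the same parse() callback using that XPath, with extract_first() instead of extract()[0] so a non-matching selector gives None rather than raising IndexError (assuming the same page structure as above):

def parse(self, response):
    for sel in response.xpath('//article[@itemprop="articleBody"]'
                              '//div[@class="expert-content-text"]/ul/li/a'):
        item = JobfetchItem()
        # extract_first() returns None instead of raising when nothing matches
        item['title'] = sel.xpath('text()').extract_first()
        item['link'] = sel.xpath('@href').extract_first()
        yield item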

Use the Scrapy shell session below to extract the data you want to scrape.
In [1]: response.xpath('//div[@class="expert-content-text"]/ul/li/a/text()').extract()
Out[1]:
[u'Accounting',
u'Administrative',
u'Advertising',
u'Airline',
u'Animal',
u'Alternative Energy',
u'Auction House',
u'Banking',
u'Biotechnology',
u'Business',
u'Business Intelligence',
u'Chef',
u'College Admissions',
u'College Alumni Relations and Development ',
u'College Student Services',
u'Construction',
u'Consulting',
u'Corporate',
u'Cruise Ship',
u'Customer Service',
u'Data Science',
u'Engineering',
u'Entry Level Jobs',
u'Environmental',
u'Event Planning',
u'Fashion',
u'Film',
u'First Job',
u'Fundraiser',
u'Healthcare/Medical',
u'Health/Safety',
u'Hospitality',
u'Human Resources',
u'Human Services / Social Work',
u'Information Technology (IT)',
u'Insurance',
u'International Affairs / Development',
u'International Business',
u'Investment Banking',
u'Law Enforcement',
u'Legal',
u'Maintenance',
u'Management',
u'Manufacturing',
u'Marketing',
u'Media',
u'Museum',
u'Music',
u'Non Profit',
u'Nursing',
u'Outdoor ',
u'Public Administration',
u'Public Relations',
u'Purchasing',
u'Radio',
u'Real Estate ',
u'Restaurant',
u'Retail',
u'Sales',
u'School',
u'Science',
u'Ski and Snow Jobs',
u'Social Media',
u'Social Work',
u'Sports',
u'Television',
u'Trades',
u'Transportation',
u'Travel',
u'Yacht Jobs']
In [2]: response.xpath('//div[@class="expert-content-text"]/ul/li/a/@href').extract()
Out[2]:
[u'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm',
u'http://jobsearch.about.com/od/job-titles/a/advertising-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/airline-industry-jobs.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/animal-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/alternative-energy-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/auction-house-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/banking-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/biotechnology-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/business-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/business-intelligence-job-titles.htm',
u'http://culinaryarts.about.com/od/culinaryfundamentals/a/whatisachef.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-admissions-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-alumni-relations-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-student-service-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/construction-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/consulting-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/c-level-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/cruise-ship-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/customer-service-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/data-science-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/engineering-job-titles.htm',
u'http://jobsearch.about.com/od/best-jobs/a/best-entry-level-jobs.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/environmental-job-titles.htm',
u'http://eventplanning.about.com/od/eventcareers/tp/corporateevents.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/fashion-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/film-job-titles.htm',
u'http://jobsearch.about.com/od/justforstudents/a/first-job-list.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/fundraiser-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/health-care-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/health-safety-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/hospitality-job-titles.htm',
u'http://humanresources.about.com/od/HR-Roles-And-Responsibilities/fl/human-resources-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/human-services-social-work-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/it-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/insurance-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/international-affairs-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/international-business-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/investment-banking-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/law-enforcement-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/legal-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/maintenance-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/management-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/manufacturing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/marketing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/media-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/museum-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/music-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/nonprofit-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/nursing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/outdoor-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/public-administration-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/public-relations-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/purchasing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/radio-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/real-estate-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/restaurant-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/retail-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/sales-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/high-school-middle-school-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/science-job-titles.htm',
u'http://jobsearch.about.com/od/skiandsnowjobs/a/skijob2_2.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/social-media-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/social-work-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/sports-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/television-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/trades-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/transportation-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/travel-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/yacht-job-titles.htm']
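Note that extracting the titles and the hrefs as two separate lists leaves you to pair them up again yourself; iterating over the a elements keeps each title with its link. A small sketch of that in the same shell session (same selector as above):

In [3]: for a in response.xpath('//div[@class="expert-content-text"]/ul/li/a'):
   ...:     print a.xpath('text()').extract_first(), a.xpath('@href').extract_first()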

Related

XPATH issue while looping through tags

I have this piece of code where I try to download these papers, but the loop yields the first element only.
import scrapy
from urllib.parse import urljoin


class SimpleSpider(scrapy.Spider):
    name = 'simple'
    start_urls = ['https://jmedicalcasereports.biomedcentral.com/articles?query=COVID-19&searchType=journalSearch&tab=keyword']

    def parse(self, response):
        for book in response.xpath('//*[@id="main-content"]/div/main/div[2]/ol'):
            title = response.xpath('/li[3]/article/h3/a/text()').get()
            link = urljoin(
                'https://jmedicalcasereports.biomedcentral.com/',
                response.xpath('/li[3]/article/ul/li[2]/a/@href').get()
            )
            yield {
                'Title': title,
                'file_urls': [link]
            }
I used CSS first and then XPath; the problem is with the loop code.
Firstly, in the third line of your code, response should be changed to book, so the expression is applied to the current list item:

title = book.xpath('.//a/text()').get()

Secondly, in your second line, you give an incorrect XPath: it selects the single ol element, so the loop runs only once over the whole list and the result is not correct. This is my code; hope this can help you.
def parse(self, response):
    for book in response.xpath('//li[@class="c-listing__item"]'):
        title = book.xpath('.//a/text()').get()
        link = urljoin(
            'https://jmedicalcasereports.biomedcentral.com/',
            book.xpath('.//a/@href').get()
        )
        yield {
            'Title': title,
            'file_urls': [link]
        }
The response is:

{'Title': 'Presentation of COVID-19 infection with bizarre behavior and encephalopathy: a case report', 'file_urls': ['https://jmedicalcasereports.biomedcentral.com/articles/10.1186/s13256-021-02851-0']}
2022-04-17 21:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://jmedicalcasereports.biomedcentral.com/articles?query=COVID-19&searchType=journalSearch&tab=keyword>
{'Title': 'Dysentery as the only presentation of COVID-19 in a child: a\xa0case report', 'file_urls': ['https://jmedicalcasereports.biomedcentral.com/articles/10.1186/s13256-021-02672-1']}
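One more note, since the goal is to download the papers: the file_urls field only triggers downloads when Scrapy's FilesPipeline is enabled. A minimal sketch of the settings that would need (the storage path is just an example):

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = '/path/to/store/papers'  # example path, adjust to your project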

Need to Extract contents of subpages using scrapy

I'm fairly new to scrapy but have made a few simple scrapers work for me.
I'm trying to go to the next level by getting all the links from one page and scraping the content of the subpages. I've read up on a few different examples and Q&As but can't seem to get this code to work for me.
import scrapy
from ..items import remoteworkhub_jobs


class remoteworkhub(scrapy.Spider):
    name = 'remoteworkhub'
    allowed_domains = ['www.remoteworkhub.com']
    #start_urls = ['https://jobs.remoteworkhub.com/']
    start_urls = ['https://jobs.remoteworkhub.com']

    # Scrape the individual job urls and pass them to the spider
    def parse(self, response):
        links = response.xpath('//a[@class="jobList-title"]/@href').extract()
        for jobs in links:
            base_url = 'https://jobs.remoteworkhub.com'
            Url = base_url + jobs
            yield scrapy.Request(Url, callback=self.parsejobpage)

    def parsejobpage(self, response):
        # Extracting the content using css selectors
        titles = response.xpath('//h1[@class="u-mv--remove u-textH2"]/text()').extract()
        companys = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div/div[1]/strong/a/text()').extract()
        categories = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[3]/ul/li/a/text()').extract()
        worktype = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[5]/div[2]/span/text()').extract()
        job_description = response.xpath('//div[@class="job-body"]//text()').extract()
        #titles = response.css('.jobDetail-headerIntro::text').extract()
        #titles = response.xpath('//title').get()
        #votes = response.css('.score.unvoted::text').extract()
        #times = response.css('time::attr(title)').extract()
        #comments = response.css('.comments::text').extract()
        item = remoteworkhub_jobs()
        #item['jobUrl'] = jobUrl
        item['title'] = titles
        #item['company'] = companys
        #item['category'] = categories
        #item['worktype'] = worktype
        #item['job_description'] = job_description
        # yield or give the scraped info to scrapy
        yield item
Check out the following implementation, which should let you parse the job titles and their corresponding company names from that site. The way you have defined your XPaths is error-prone, since they rely on absolute positions; I've modified them so that they work in a more robust way. Give it a shot:
import scrapy


class remoteworkhub(scrapy.Spider):
    name = 'remoteworkhub'
    start_urls = ['https://jobs.remoteworkhub.com']

    def parse(self, response):
        for job_link in response.xpath("//*[contains(@class,'job-listing')]//*[@class='jobList-title']/@href").extract():
            Url = response.urljoin(job_link)
            yield scrapy.Request(Url, callback=self.parsejobpage)

    def parsejobpage(self, response):
        d = {}
        d['title'] = response.xpath("//*[@class='jobDetail-headerIntro']/h1/text()").get()
        d['company'] = response.xpath("//*[@class='jobDetail-headerIntro']//strong//text()").get()
        yield d
This is the kind of output I can see in the console if I use print instead of yield:
{'title': 'Sr Full Stack Developer, Node/React - Remote', 'company': 'Clevertech'}
{'title': 'Subject Matter Expert, Customer Experience - Remote', 'company': 'Qualtrics'}
{'title': 'Employee Experience Enterprise Account Executive - Academic and Government - Remote', 'company': 'Qualtrics'}
{'title': 'Senior Solutions Consultant, Brand Experience - Remote', 'company': 'Qualtrics'}
{'title': 'Data Analyst - Remote', 'company': 'Railsware'}
{'title': 'Recruitment Manager - Remote', 'company': 'Railsware'}
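As a side note, newer Scrapy versions (1.4+) can shorten the urljoin-plus-Request pattern with response.follow, which accepts relative URLs directly. A sketch of the same parse() using it, under that version assumption:

def parse(self, response):
    for job_link in response.xpath("//*[contains(@class,'job-listing')]"
                                   "//*[@class='jobList-title']/@href").extract():
        # response.follow() resolves relative URLs against the response URL
        yield response.follow(job_link, callback=self.parsejobpage)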

Python Recursive Scraping with Scrapy

I'm trying to make a scraper that will pull links, titles, prices and the body of posts on craigslist. I have been able to get the prices, but it returns the price for every listing on the page, not just for the specific row. I am also unable to get it to go to the next page and continue scraping.
This is the tutorial I am using - http://mherman.org/blog/2012/11/08/recursively-scraping-web-pages-with-scrapy/
I've tried suggestions from this thread, but still can't make it work - Scrapy Python Craigslist Scraper
The page I'm trying to scrape is - http://medford.craigslist.org/cto/
In the item price line, if I remove the // before span[@class="l2"] it returns no prices, but if I leave it there it includes every price on the page.
For the rules, I've tried playing with the class tags but it seems to hang on the first page. I'm thinking I might need separate spider classes?
Here is my code:
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      CD
#
# Created:     02/03/2014
# Copyright:   (c) CD 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
import sys


class PageSpider(BaseSpider):
    name = "cto"
    allowed_domains = ["medford.craigslist.org"]
    start_urls = ["http://medford.craigslist.org/cto/"]

    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ),
                                    restrict_xpaths=('//span[@class="button next"]',)),
                  callback="parse", follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"] | //span[@class="l2"]')
        for title in titles:
            item = CraigslistSampleItem()
            item['title'] = title.select("a/text()").extract()
            item['link'] = title.select("a/@href").extract()
            item['price'] = title.select('//span[@class="l2"]//span[@class="price"]/text()').extract()
            url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
The idea is simple: find all the paragraphs in the div with class="content", then from every paragraph extract the link, the link text and the price. Note that the select() method is currently deprecated; use xpath() instead.
Here's a modified version of the parse() method:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//div[@class="content"]/p[@class="row"]')
    for row in rows:
        item = CraigslistSampleItem()
        link = row.xpath('.//span[@class="pl"]/a')
        item['title'] = link.xpath("text()").extract()
        item['link'] = link.xpath("@href").extract()
        item['price'] = row.xpath('.//span[@class="l2"]/span[@class="price"]/text()').extract()
        url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
        yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
This is a sample of what I'm getting:
{'description': [u"\n\t\tHave a nice, sturdy, compact car hauler/trailer. May be used for other hauling like equipstment, ATV's and the like, Very solid and in good shape. Parice to sell at only $995. Call Bill at 541 944 2929 top see or Roy at 541 9733421. \n\t"],
'link': [u'/cto/4354771900.html'],
'price': [u'$995'],
'title': [u'compact sturdy car trailer ']}
Hope that helps.
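As for the pagination hanging on the first page: Rule objects are only processed by CrawlSpider, not BaseSpider, so the rules attribute in the spider above is silently ignored. A sketch of the change (note that CrawlSpider reserves the parse() name internally, so the callback has to be renamed, e.g. to parse_page, which is a name I'm introducing here):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class PageSpider(CrawlSpider):  # CrawlSpider, so rules are actually applied
    name = "cto"
    allowed_domains = ["medford.craigslist.org"]
    start_urls = ["http://medford.craigslist.org/cto/"]
    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ),
                                    restrict_xpaths=('//span[@class="button next"]',)),
                  callback="parse_page", follow=True), )

    def parse_page(self, response):
        pass  # same body as the parse() method shown above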

Python Scrapy for grabbing table columns and rows

I'm a relative noob at Python and it's my first time learning Scrapy. I've done data mining with Perl quite successfully before, but this is a whole different ballgame!
I'm trying to scrape a table and grab the columns of each row. My code is below.
items.py

from scrapy.item import Item, Field


class Cio100Item(Item):
    company = Field()
    person = Field()
    industry = Field()
    url = Field()
scrape.py (the spider)

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from cio100.items import Cio100Item

items = []


class MySpider(BaseSpider):
    name = "scrape"
    allowed_domains = ["cio.co.uk"]
    start_urls = ["http://www.cio.co.uk/cio100/2013/cio/"]

    def parse(self, response):
        sel = Selector(response)
        tables = sel.xpath('//table[@class="bgWhite listTable"]//h2')
        for table in tables:
            # print table
            item = Cio100Item()
            item['company'] = table.xpath('a/text()').extract()
            item['person'] = table.xpath('a/text()').extract()
            item['industry'] = table.xpath('a/text()').extract()
            item['url'] = table.xpath('a/@href').extract()
            items.append(item)
        return items
I'm having some trouble understanding how to articulate the XPath selection correctly.
I think this line is the problem:
tables = sel.xpath('//table[@class="bgWhite listTable"]//h2')
When I run the scraper as-is above, I get things like this in the terminal:
2014-01-13 22:13:29-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u"\nDomino's Pizza\n"],
'industry': [u"\nDomino's Pizza\n"],
'person': [u"\nDomino's Pizza\n"],
'url': [u'/cio100/2013/dominos-pizza/']}
2014-01-13 22:13:29-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u'\nColin Rees\n'],
'industry': [u'\nColin Rees\n'],
'person': [u'\nColin Rees\n'],
'url': [u'/cio100/2013/dominos-pizza/']}
Ideally I want only one block, not two, with Domino's in the company slot, Colin in the person slot, and the industry grabbed as well, which it's not doing.
When I use Firebug to inspect the table, I see h2 for columns 1 and 2 (company and person), but column 3 is h3?
When I modify the tables line to h3 at the end, as follows:
tables = sel.xpath('//table[@class="bgWhite listTable"]//h3')
I get this
2014-01-13 22:16:46-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u'\nRetail\n'],
'industry': [u'\nRetail\n'],
'person': [u'\nRetail\n'],
'url': [u'/cio100/2013/dominos-pizza/']}
Here it only produces one block, and it captures the industry and the URL correctly, but it doesn't get the company name or the person.
Any help will be greatly appreciated!
Thanks!
As far as the XPath goes, consider doing something like:

$ scrapy shell http://www.cio.co.uk/cio100/2013/cio/
...
>>> for tr in sel.xpath('//table[@class="bgWhite listTable"]/tr'):
...     item = Cio100Item()
...     item['company'] = tr.xpath('td[2]//a/text()').extract()[0].strip()
...     item['person'] = tr.xpath('td[3]//a/text()').extract()[0].strip()
...     item['industry'] = tr.xpath('td[4]//a/text()').extract()[0].strip()
...     item['url'] = tr.xpath('td[4]//a/@href').extract()[0].strip()
...     print item
...
{'company': u'LOCOG',
'industry': u'Leisure and entertainment',
'person': u'Gerry Pennell',
'url': u'/cio100/2013/locog/'}
{'company': u'Laterooms.com',
'industry': u'Leisure and entertainment',
'person': u'Adam Gerrard',
'url': u'/cio100/2013/lateroomscom/'}
{'company': u'Vodafone',
'industry': u'Communications and IT services',
'person': u'Albert Hitchcock',
'url': u'/cio100/2013/vodafone/'}
...
Other than that, you'd better yield items one by one rather than accumulate them in a list.
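Putting both points together, a sketch of what parse() could look like as a generator (the td column indices follow the shell session above and are assumptions about the table layout):

def parse(self, response):
    sel = Selector(response)
    for tr in sel.xpath('//table[@class="bgWhite listTable"]/tr'):
        item = Cio100Item()
        item['company'] = tr.xpath('td[2]//a/text()').extract()[0].strip()
        item['person'] = tr.xpath('td[3]//a/text()').extract()[0].strip()
        item['industry'] = tr.xpath('td[4]//a/text()').extract()[0].strip()
        item['url'] = tr.xpath('td[4]//a/@href').extract()[0].strip()
        yield item  # yield each item instead of appending to a module-level list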

Scrapy: one row per item

I'm using Scrapy to scrape category pages for products from architonic.com. However, I would like to display these products in a CSV, one per row. In the current situation all the brand names from a given category page are listed under 'brand', while I would like output like this:
{'brand': [u'Elisabeth Ellefsen'],
'title': [u'Up chair I 907'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/4373/t-up-06f-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/up-chair-tonon/1174373']
}
I tried playing with Item Loaders (adding default_output_processor = TakeFirst()), adding 'yield item' (see the commented code), and have searched for two days without finding a solution. Hoping someone is willing to help me; any help is really appreciated.
My output looks like this:
2013-01-14 11:53:23+0100 [archi] DEBUG: Scraped from <200 http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/3>
{'brand': [u'Softline',
u'Elisabeth Ellefsen',
u'Sellex',
u'Lievore Altherr Molina',
u'Poliform',
.....
u'Hans Thyge & Co.'],
'img_url': [u'http://image.architonic.com/img_pro1-1/117/3661/terra-h-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/0852/fly-01-sq.jpg',
u'http://image.architonic.com/img_pro1-1/116/9870/ley-0004-sq.jpg',
u'http://image.architonic.com/img_pro1-1/117/1023/arflex-hollywood-03-sq.jpg',
...
u'http://image.architonic.com/img_pro1-1/118/5357/reef-002-sq.jpg'],
'link': [u'http://www.architonic.com/pmsht/terra-softline/1173661',
u'http://www.architonic.com/pmsht/fly-sellex/1170852',
u'http://www.architonic.com/pmsht/ley-poliform/1169870',
.....
u'http://www.architonic.com/pmsht/reef-collection-labofa/1185357'],
'title': [u'Terra',
u'Fly',
u'Ley chair',
.....
u'Hollywood Sofa',
u'Pouff Round']}
I'm using this in spider/archi_spider.py
import string
import re

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.markup import remove_entities

from archiscraper.items import ArchiItemFields, ArchiLoader


class ArchiScraper(BaseSpider):
    name = "archi"
    allowed_domains = ["architonic.com"]
    start_urls = ['http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/%s' % page for page in xrange(1, 4)]
    # rules = (Rule(SgmlLinkExtractor(allow=('.', ), restrict_xpaths=('//*[@id="right_arrow"]',))
    #     , callback="parse_items", follow=True),
    # )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//li[contains(@class, "nav_pro_item")]')
        items = []
        for site in sites:
            item = ArchiLoader(ArchiItemFields(), site)
            item.add_xpath('brand', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            item.add_xpath('designer', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            item.add_xpath('title', '//*[contains(@class, "nav_pro_text")]/a/strong/text()')
            item.add_xpath('img_url', '//li[contains(@class, "nav_pro_item")]/div/a/img/@src[1]')
            item.add_xpath('link', '//*[contains(@class, "nav_pro_text")]/a/@href')
            items.append(item.load_item())
        return items
        # for item in items:
        #     yield item
items.py

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
import string

from scrapy.item import Item, Field
from scrapy.contrib.loader.processor import MapCompose, Join, TakeFirst
from scrapy.utils.markup import remove_entities
from scrapy.contrib.loader import XPathItemLoader


class ArchiItem():
    pass


class ArchiItemFields(Item):
    brand = Field()
    title = Field()
    designer = Field()
    img_url = Field()
    img = Field()
    link = Field()


class ArchiLoader(XPathItemLoader):
    # default_input_processor = MapCompose(unicode.strip)
    # default_output_processor = TakeFirst()
    brand_out = MapCompose(unicode.strip)
    # title_out = Join()
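No answer is shown for this one, but the likely culprit is the same one that comes up in the other questions above: the add_xpath() calls use absolute //... expressions, so every loader collects the fields of all products on the page instead of only those inside its own site node. A hedged sketch of the loop with relative paths (note the leading ., which anchors each XPath to the current li; the exact sub-paths are assumptions about the page markup):

for site in sites:
    item = ArchiLoader(ArchiItemFields(), site)
    # the leading "." anchors each expression to the current nav_pro_item node
    item.add_xpath('brand', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
    item.add_xpath('designer', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
    item.add_xpath('title', './/*[contains(@class, "nav_pro_text")]/a/strong/text()')
    item.add_xpath('img_url', './div/a/img/@src')
    item.add_xpath('link', './/*[contains(@class, "nav_pro_text")]/a/@href')
    yield item.load_item()

Combined with default_output_processor = TakeFirst() on the loader, that should produce one product per item, i.e. one CSV row each.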
