Python Scrapy to extract specific XPath fields
I have the following structure (sample). I am using Scrapy to extract the details. I need to extract the 'href' attribute and the link text, e.g. 'Accounting'. I am new to XPath; any help extracting these specific fields would be appreciated.
<div class = 'something'>
<ul>
<li>Accounting</li>
<li>Administrative</li>
<li>Advertising</li>
<li>Airline</li>
</ul>
</div>
My code is:
from scrapy.spider import BaseSpider
from jobfetch.items import JobfetchItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose

class JobFetchSpider(BaseSpider):
    """Spider for regularly updated livingsocial.com site, San Francisco Page"""
    name = "Jobsearch"
    allowed_domains = ["jobsearch.about.com/"]
    start_urls = ['http://jobsearch.about.com/od/job-titles/fl/job-titles-a-z.htm']

    def parse(self, response):
        count = 0
        for sel in response.xpath('//*[@id="main"]/div/div[2]/div[1]/div/div[2]/article/div[2]/ul[1]'):
            item = JobfetchItem()
            item['title'] = sel.extract()
            item['link'] = sel.extract()
            count = count + 1
            print item
        yield item
The problems you have in the code:

- yield item should be inside the loop, since you are instantiating items there
- the XPath you have is messy and not very reliable, since it depends heavily on the elements' location inside their parent tags and starts almost from the top parent of the document
- your XPath is also incorrect: it should go down to the a elements inside the li elements inside the ul
- sel.extract() would only give you that whole ul element, extracted as a string

For the sake of an example, use a CSS selector here to get to the a tags inside the li tags:
import scrapy
from jobfetch.items import JobfetchItem

class JobFetchSpider(scrapy.Spider):
    name = "Jobsearch"
    allowed_domains = ["jobsearch.about.com/"]
    start_urls = ['http://jobsearch.about.com/od/job-titles/fl/job-titles-a-z.htm']

    def parse(self, response):
        for sel in response.css('article[itemprop="articleBody"] div.expert-content-text > ul > li > a'):
            item = JobfetchItem()
            item['title'] = sel.xpath('text()').extract()[0]
            item['link'] = sel.xpath('@href').extract()[0]
            yield item
Running the spider produces:
{'link': u'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm', 'title': u'Accounting'}
{'link': u'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm', 'title': u'Administrative'}
...
{'link': u'http://jobsearch.about.com/od/job-title-samples/fl/yacht-job-titles.htm', 'title': u'Yacht Jobs'}
FYI, we could have used xpath() also:
//article[@itemprop="articleBody"]//div[@class="expert-content-text"]/ul/li/a
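For completeness, here is a sketch (untested against the live page) of the same parse() method using that XPath instead of the CSS selector; it assumes the same JobfetchItem as above:

    def parse(self, response):
        # Same loop as above, but selecting the a tags via xpath()
        for sel in response.xpath('//article[@itemprop="articleBody"]//div[@class="expert-content-text"]/ul/li/a'):
            item = JobfetchItem()
            item['title'] = sel.xpath('text()').extract()[0]
            item['link'] = sel.xpath('@href').extract()[0]
            yield item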
Use the script below to extract the data you want to scrape.
In [1]: response.xpath('//div[@class="expert-content-text"]/ul/li/a/text()').extract()
Out[1]:
[u'Accounting',
u'Administrative',
u'Advertising',
u'Airline',
u'Animal',
u'Alternative Energy',
u'Auction House',
u'Banking',
u'Biotechnology',
u'Business',
u'Business Intelligence',
u'Chef',
u'College Admissions',
u'College Alumni Relations and Development ',
u'College Student Services',
u'Construction',
u'Consulting',
u'Corporate',
u'Cruise Ship',
u'Customer Service',
u'Data Science',
u'Engineering',
u'Entry Level Jobs',
u'Environmental',
u'Event Planning',
u'Fashion',
u'Film',
u'First Job',
u'Fundraiser',
u'Healthcare/Medical',
u'Health/Safety',
u'Hospitality',
u'Human Resources',
u'Human Services / Social Work',
u'Information Technology (IT)',
u'Insurance',
u'International Affairs / Development',
u'International Business',
u'Investment Banking',
u'Law Enforcement',
u'Legal',
u'Maintenance',
u'Management',
u'Manufacturing',
u'Marketing',
u'Media',
u'Museum',
u'Music',
u'Non Profit',
u'Nursing',
u'Outdoor ',
u'Public Administration',
u'Public Relations',
u'Purchasing',
u'Radio',
u'Real Estate ',
u'Restaurant',
u'Retail',
u'Sales',
u'School',
u'Science',
u'Ski and Snow Jobs',
u'Social Media',
u'Social Work',
u'Sports',
u'Television',
u'Trades',
u'Transportation',
u'Travel',
u'Yacht Jobs']
In [2]: response.xpath('//div[@class="expert-content-text"]/ul/li/a/@href').extract()
Out[2]:
[u'http://jobsearch.about.com/od/job-title-samples/a/accounting-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/admin-job-titles.htm',
u'http://jobsearch.about.com/od/job-titles/a/advertising-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/airline-industry-jobs.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/animal-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/alternative-energy-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/auction-house-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/banking-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/biotechnology-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/business-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/business-intelligence-job-titles.htm',
u'http://culinaryarts.about.com/od/culinaryfundamentals/a/whatisachef.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-admissions-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-alumni-relations-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/college-student-service-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/construction-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/consulting-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/c-level-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/cruise-ship-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/customer-service-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/data-science-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/engineering-job-titles.htm',
u'http://jobsearch.about.com/od/best-jobs/a/best-entry-level-jobs.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/environmental-job-titles.htm',
u'http://eventplanning.about.com/od/eventcareers/tp/corporateevents.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/fashion-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/film-job-titles.htm',
u'http://jobsearch.about.com/od/justforstudents/a/first-job-list.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/fundraiser-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/health-care-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/health-safety-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/hospitality-job-titles.htm',
u'http://humanresources.about.com/od/HR-Roles-And-Responsibilities/fl/human-resources-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/human-services-social-work-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/it-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/insurance-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/international-affairs-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/international-business-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/investment-banking-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/law-enforcement-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/legal-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/maintenance-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/management-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/manufacturing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/marketing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/media-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/museum-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/music-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/nonprofit-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/nursing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/outdoor-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/public-administration-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/public-relations-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/purchasing-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/radio-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/real-estate-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/restaurant-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/retail-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/sales-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/high-school-middle-school-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/science-job-titles.htm',
u'http://jobsearch.about.com/od/skiandsnowjobs/a/skijob2_2.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/social-media-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/social-work-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/sports-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/television-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/trades-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/a/transportation-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/travel-job-titles.htm',
u'http://jobsearch.about.com/od/job-title-samples/fl/yacht-job-titles.htm']
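If you want each title paired with its link rather than two parallel lists, one simple approach (a sketch, continuing the same shell session) is to zip the two extractions together; both lists come from the same a nodes, so they line up index by index:

In [3]: titles = response.xpath('//div[@class="expert-content-text"]/ul/li/a/text()').extract()

In [4]: links = response.xpath('//div[@class="expert-content-text"]/ul/li/a/@href').extract()

In [5]: pairs = zip(titles, links)  # [(u'Accounting', u'http://...accounting-job-titles.htm'), ...]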
Related
XPath issue while looping through tags
I have this piece of code, where I try to download these papers, but the loop prints the first element only.

import scrapy
from urllib.parse import urljoin

class SimpleSpider(scrapy.Spider):
    name = 'simple'
    start_urls = ['https://jmedicalcasereports.biomedcentral.com/articles?query=COVID-19&searchType=journalSearch&tab=keyword']

    def parse(self, response):
        for book in response.xpath('//*[@id="main-content"]/div/main/div[2]/ol'):
            title = response.xpath('/li[3]/article/h3/a/text()').get()
            link = urljoin('https://jmedicalcasereports.biomedcentral.com/',
                           response.xpath('/li[3]/article/ul/li[2]/a/@href').get())
            yield {
                'Title': title,
                'file_urls': [link],
            }

I used CSS and then XPath; the problem is with the loop code.
Firstly, in the third line of your code, response should be changed to book:

title = book.xpath('.//a/text()').get()

Secondly, in your second line, you give an incorrect XPath, so the result is not correct. This is my code; hope this can help you:

def parse(self, response):
    for book in response.xpath('//li[@class="c-listing__item"]'):
        title = book.xpath('.//a/text()').get()
        link = urljoin('https://jmedicalcasereports.biomedcentral.com/',
                       book.xpath('.//a/@href').get())
        yield {
            'Title': title,
            'file_urls': [link],
        }

The response is:

{'Title': 'Presentation of COVID-19 infection with bizarre behavior and encephalopathy: a case report',
 'file_urls': ['https://jmedicalcasereports.biomedcentral.com/articles/10.1186/s13256-021-02851-0']}
2022-04-17 21:54:27 [scrapy.core.scraper] DEBUG: Scraped from <200 https://jmedicalcasereports.biomedcentral.com/articles?query=COVID-19&searchType=journalSearch&tab=keyword>
{'Title': 'Dysentery as the only presentation of COVID-19 in a child: a\xa0case report',
 'file_urls': ['https://jmedicalcasereports.biomedcentral.com/articles/10.1186/s13256-021-02672-1']}
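One side note on the code above: the file_urls key only causes Scrapy to download the linked files if the FilesPipeline is enabled. A minimal settings sketch (the FILES_STORE path is a hypothetical local directory):

# settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = 'downloads'  # hypothetical directory where the downloaded papers land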
Need to Extract contents of subpages using scrapy
I'm fairly new to scrapy but have made a few simple scrapers work for me. I'm trying to go to the next level by getting all the links from one page and scraping the content of the subpages. I've read up a few different examples and Q&As but can't seem to get this code to work for me.

import scrapy
from ..items import remoteworkhub_jobs

class remoteworkhub(scrapy.Spider):
    name = 'remoteworkhub'
    allowed_domains = ['www.remoteworkhub.com']
    #start_urls = ['https://jobs.remoteworkhub.com/']
    start_urls = ['https://jobs.remoteworkhub.com']

    # Scrape the individual job urls and pass them to the spider
    def parse(self, response):
        links = response.xpath('//a[@class="jobList-title"]/@href').extract()
        for jobs in links:
            base_url = 'https://jobs.remoteworkhub.com'
            Url = base_url + jobs
            yield scrapy.Request(Url, callback=self.parsejobpage)

    def parsejobpage(self, response):
        #Extracting the content using css selectors
        titles = response.xpath('//h1[@class="u-mv--remove u-textH2"]/text()').extract()
        companys = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[2]/div[2]/div/div[1]/strong/a/text()').extract()
        categories = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[3]/ul/li/a/text()').extract()
        worktype = response.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div[1]/div[5]/div[2]/span/text()').extract()
        job_decription = response.xpath('//div[@class="job-body"]//text()').extract()
        #titles = response.css('.jobDetail-headerIntro::text').extract()
        #titles = response.xpath('//title').get()
        #votes = response.css('.score.unvoted::text').extract()
        #times = response.css('time::attr(title)').extract()
        #comments = response.css('.comments::text').extract()

        item = remoteworkhub_jobs()
        #item['jobUrl'] = jobUrl
        item['title'] = titles
        #item['company'] = companys
        #item['category'] = categories
        #item['worktype'] = worktype
        #item['job_description'] = job_decription
        #yield or give the scraped info to scrapy
        yield item
Check out the following implementation, which should let you parse the job titles and the corresponding company names from that site. The way you have defined your XPaths is error prone, as they rely on absolute positions; I've modified them so that they work in a more robust way. Give it a shot:

import scrapy

class remoteworkhub(scrapy.Spider):
    name = 'remoteworkhub'
    start_urls = ['https://jobs.remoteworkhub.com']

    def parse(self, response):
        for job_link in response.xpath("//*[contains(@class,'job-listing')]//*[@class='jobList-title']/@href").extract():
            Url = response.urljoin(job_link)
            yield scrapy.Request(Url, callback=self.parsejobpage)

    def parsejobpage(self, response):
        d = {}
        d['title'] = response.xpath("//*[@class='jobDetail-headerIntro']/h1/text()").get()
        d['company'] = response.xpath("//*[@class='jobDetail-headerIntro']//strong//text()").get()
        yield d

This is the kind of output I can see in the console if I use print instead of yield:

{'title': 'Sr Full Stack Developer, Node/React - Remote', 'company': 'Clevertech'}
{'title': 'Subject Matter Expert, Customer Experience - Remote', 'company': 'Qualtrics'}
{'title': 'Employee Experience Enterprise Account Executive - Academic and Government - Remote', 'company': 'Qualtrics'}
{'title': 'Senior Solutions Consultant, Brand Experience - Remote', 'company': 'Qualtrics'}
{'title': 'Data Analyst - Remote', 'company': 'Railsware'}
{'title': 'Recruitment Manager - Remote', 'company': 'Railsware'}
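If you would rather keep populating your remoteworkhub_jobs item instead of a plain dict, the same callback can fill it directly; a sketch, assuming the item class from the question:

    def parsejobpage(self, response):
        item = remoteworkhub_jobs()
        # get() returns the first match or None, so a missing field fails soft
        item['title'] = response.xpath("//*[@class='jobDetail-headerIntro']/h1/text()").get()
        yield item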
Python Recursive Scraping with Scrapy
I'm trying to make a scraper that will pull links, titles, prices and the body of posts on craigslist. I have been able to get the prices, but it returns the price for every listing on the page, not just for the specific row. I am also unable to get it to go to the next page and continue scraping.

This is the tutorial I am using - http://mherman.org/blog/2012/11/08/recursively-scraping-web-pages-with-scrapy/

I've tried suggestions from this thread, but still can't make it work - Scrapy Python Craigslist Scraper

The page I'm trying to scrape is - http://medford.craigslist.org/cto/

In the link price variable, if I remove the // before span[@class="l2"] it returns no prices, but if I leave it there it includes every price on the page.

For the rules, I've tried playing with the class tags but it seems to hang on the first page. I'm thinking I might need separate spider classes?

Here is my code:

#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      CD
#
# Created:     02/03/2014
# Copyright:   (c) CD 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
import sys

class PageSpider(BaseSpider):
    name = "cto"
    allowed_domains = ["medford.craigslist.org"]
    start_urls = ["http://medford.craigslist.org/cto/"]

    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ),
                                    restrict_xpaths=('//span[@class="button next"]',)),
                  callback="parse", follow=True),
             )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"] | //span[@class="l2"]')
        for title in titles:
            item = CraigslistSampleItem()
            item['title'] = title.select("a/text()").extract()
            item['link'] = title.select("a/@href").extract()
            item['price'] = title.select('//span[@class="l2"]//span[@class="price"]/text()').extract()
            url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
The idea is simple: find all paragraphs in a div with class="content", then from every paragraph extract the link, the link text and the price. Note that the select() method is currently deprecated; use xpath() instead.

Here's a modified version of the parse() method:

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//div[@class="content"]/p[@class="row"]')
    for row in rows:
        item = CraigslistSampleItem()
        link = row.xpath('.//span[@class="pl"]/a')
        item['title'] = link.xpath("text()").extract()
        item['link'] = link.xpath("@href").extract()
        item['price'] = row.xpath('.//span[@class="l2"]/span[@class="price"]/text()').extract()
        url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
        yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

This is a sample of what I'm getting:

{'description': [u"\n\t\tHave a nice, sturdy, compact car hauler/trailer. May be used for other hauling like equipstment, ATV's and the like, Very solid and in good shape. Parice to sell at only $995. Call Bill at 541 944 2929 top see or Roy at 541 9733421. \n\t"],
 'link': [u'/cto/4354771900.html'],
 'price': [u'$995'],
 'title': [u'compact sturdy car trailer ']}

Hope that helps.
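As for the pagination part of the question: the rules attribute is only processed by CrawlSpider, not BaseSpider, and CrawlSpider reserves the parse() callback for its own link handling, so the callback has to be renamed. A skeleton sketch against the old API the question uses (untested):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class PageSpider(CrawlSpider):  # CrawlSpider, not BaseSpider, so rules are applied
    name = "cto"
    allowed_domains = ["medford.craigslist.org"]
    start_urls = ["http://medford.craigslist.org/cto/"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r"index\d00\.html",)),
             callback="parse_page",  # renamed: overriding parse() breaks CrawlSpider
             follow=True),
    )

    def parse_page(self, response):
        # same row-extraction logic as the parse() method shown above
        pass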
Python Scrapy for grabbing table columns and rows
I'm relatively a noob at Python and it's my first time learning Scrapy. I've done data mining with Perl quite successfully before, but this is a whole different ballgame!

I'm trying to scrape a table, grabbing the columns of each row. My code is below.

items.py

from scrapy.item import Item, Field

class Cio100Item(Item):
    company = Field()
    person = Field()
    industry = Field()
    url = Field()

scrape.py (the spider)

from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from cio100.items import Cio100Item

items = []

class MySpider(BaseSpider):
    name = "scrape"
    allowed_domains = ["cio.co.uk"]
    start_urls = ["http://www.cio.co.uk/cio100/2013/cio/"]

    def parse(self, response):
        sel = Selector(response)
        tables = sel.xpath('//table[@class="bgWhite listTable"]//h2')
        for table in tables:
            # print table
            item = Cio100Item()
            item['company'] = table.xpath('a/text()').extract()
            item['person'] = table.xpath('a/text()').extract()
            item['industry'] = table.xpath('a/text()').extract()
            item['url'] = table.xpath('a/@href').extract()
            items.append(item)
        return items

I'm having some trouble understanding how to articulate the XPath selection correctly. I think this line is the problem:

tables = sel.xpath('//table[@class="bgWhite listTable"]//h2')

When I run the scraper as-is above, I get things like this in the terminal:

2014-01-13 22:13:29-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u"\nDomino's Pizza\n"],
 'industry': [u"\nDomino's Pizza\n"],
 'person': [u"\nDomino's Pizza\n"],
 'url': [u'/cio100/2013/dominos-pizza/']}
2014-01-13 22:13:29-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u'\nColin Rees\n'],
 'industry': [u'\nColin Rees\n'],
 'person': [u'\nColin Rees\n'],
 'url': [u'/cio100/2013/dominos-pizza/']}

Ideally I want only one block, not two, with Domino's in the company slot, Colin in the person slot, and the industry grabbed, which it's not doing.

When I use Firebug to inspect the table, I see h2 for columns 1 and 2 (company and person), but column 3 is h3? When I modify the tables line to h3 at the end, as follows:

tables = sel.xpath('//table[@class="bgWhite listTable"]//h3')

I get this:

2014-01-13 22:16:46-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u'\nRetail\n'],
 'industry': [u'\nRetail\n'],
 'person': [u'\nRetail\n'],
 'url': [u'/cio100/2013/dominos-pizza/']}

Here it only produces one block, and it's capturing the industry and the URL correctly, but it's not getting the company name or person.

Any help will be greatly appreciated! Thanks!
As far as the XPath goes, consider doing something like:

$ scrapy shell http://www.cio.co.uk/cio100/2013/cio/
...
>>> for tr in sel.xpath('//table[@class="bgWhite listTable"]/tr'):
...     item = Cio100Item()
...     item['company'] = tr.xpath('td[2]//a/text()').extract()[0].strip()
...     item['person'] = tr.xpath('td[3]//a/text()').extract()[0].strip()
...     item['industry'] = tr.xpath('td[4]//a/text()').extract()[0].strip()
...     item['url'] = tr.xpath('td[4]//a/@href').extract()[0].strip()
...     print item
...
{'company': u'LOCOG', 'industry': u'Leisure and entertainment', 'person': u'Gerry Pennell', 'url': u'/cio100/2013/locog/'}
{'company': u'Laterooms.com', 'industry': u'Leisure and entertainment', 'person': u'Adam Gerrard', 'url': u'/cio100/2013/lateroomscom/'}
{'company': u'Vodafone', 'industry': u'Communications and IT services', 'person': u'Albert Hitchcock', 'url': u'/cio100/2013/vodafone/'}
...

Other than that, you had better yield items one by one rather than accumulating them in a list, as shown in the sketch below.
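Put together as a spider method, that could look like the following (a sketch mirroring the shell session above; it assumes every table row has the four td cells):

    def parse(self, response):
        sel = Selector(response)
        for tr in sel.xpath('//table[@class="bgWhite listTable"]/tr'):
            item = Cio100Item()
            item['company'] = tr.xpath('td[2]//a/text()').extract()[0].strip()
            item['person'] = tr.xpath('td[3]//a/text()').extract()[0].strip()
            item['industry'] = tr.xpath('td[4]//a/text()').extract()[0].strip()
            item['url'] = tr.xpath('td[4]//a/@href').extract()[0].strip()
            yield item  # yield one item per row instead of returning a list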
Scrapy: one row per item
I'm using scrapy to scrape category pages for products from architonic*com. However, I would like to display these products in a csv, one per row. In the current situation all the brand names from a given category page are listed under 'brand', while I would like to have an output like this:

{'brand': [u'Elisabeth Ellefsen'],
 'title': [u'Up chair I 907'],
 'img_url': [u'http://image.architonic.com/img_pro1-1/117/4373/t-up-06f-sq.jpg'],
 'link': [u'http://www.architonic.com/pmsht/up-chair-tonon/1174373']
}

I tried playing with the Item Loaders (added default_output_processor = TakeFirst()), adding 'yield item' (see commented code) and searched two days to find a solution, without luck. Hoping someone is willing to help me. Any help is really appreciated.

My output looks like this:

2013-01-14 11:53:23+0100 [archi] DEBUG: Scraped from <200 http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/3>
{'brand': [u'Softline',
           u'Elisabeth Ellefsen',
           u'Sellex',
           u'Lievore Altherr Molina',
           u'Poliform',
           .....
           u'Hans Thyge & Co.'],
 'img_url': [u'http://image.architonic.com/img_pro1-1/117/3661/terra-h-sq.jpg',
             u'http://image.architonic.com/img_pro1-1/117/0852/fly-01-sq.jpg',
             u'http://image.architonic.com/img_pro1-1/116/9870/ley-0004-sq.jpg',
             u'http://image.architonic.com/img_pro1-1/117/1023/arflex-hollywood-03-sq.jpg',
             ...
             u'http://image.architonic.com/img_pro1-1/118/5357/reef-002-sq.jpg'],
 'link': [u'http://www.architonic.com/pmsht/terra-softline/1173661',
          u'http://www.architonic.com/pmsht/fly-sellex/1170852',
          u'http://www.architonic.com/pmsht/ley-poliform/1169870',
          .....
          u'http://www.architonic.com/pmsht/reef-collection-labofa/1185357'],
 'title': [u'Terra',
           u'Fly',
           u'Ley chair',
           .....
           u'Hollywood Sofa',
           u'Pouff Round']}

I'm using this in spider/archi_spider.py:

import string
import re

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.utils.markup import remove_entities
from archiscraper.items import ArchiItemFields, ArchiLoader

class ArchiScraper(BaseSpider):
    name = "archi"
    allowed_domains = ["architonic.com"]
    start_urls = ['http://www.architonic.com/pmpro/home-furnishings/3210002/2/2/%s' % page for page in xrange(1, 4)]
    # rules = (Rule(SgmlLinkExtractor(allow=('.', ), restrict_xpaths=('//*[@id="right_arrow"]',))
    #     , callback="parse_items", follow=True),
    # )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//li[contains(@class, "nav_pro_item")]')
        items = []
        for site in sites:
            item = ArchiLoader(ArchiItemFields(), site)
            item.add_xpath('brand', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            item.add_xpath('designer', '//*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            item.add_xpath('title', '//*[contains(@class, "nav_pro_text")]/a/strong/text()')
            item.add_xpath('img_url', '//li[contains(@class, "nav_pro_item")]/div/a/img/@src[1]')
            item.add_xpath('link', '//*[contains(@class, "nav_pro_text")]/a/@href')
            items.append(item.load_item())
        return items
        # for item in items:
        #     yield item

items.py

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

import string

from scrapy.item import Item, Field
from scrapy.contrib.loader.processor import MapCompose, Join, TakeFirst
from scrapy.utils.markup import remove_entities
from scrapy.contrib.loader import XPathItemLoader

class ArchiItem():
    pass

class ArchiItemFields(Item):
    brand = Field()
    title = Field()
    designer = Field()
    img_url = Field()
    img = Field()
    link = Field()
    pass

class ArchiLoader(XPathItemLoader):
    # default_input_processor = MapCompose(unicode.strip)
    # default_output_processor = TakeFirst()
    brand_out = MapCompose(unicode.strip)
    # title_out = Join()
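For what it's worth, the symptom described above usually has the same root cause as in the craigslist question earlier on this page: the add_xpath() calls use absolute //... paths, so each loader collects every product on the page instead of just the current site node. A sketch of the parse() method with relative paths (untested against the live site):

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//li[contains(@class, "nav_pro_item")]'):
            loader = ArchiLoader(ArchiItemFields(), site)
            # Leading ".//" keeps every expression scoped to the current
            # product node instead of matching across the whole page.
            loader.add_xpath('brand', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[1][self::text()]')
            loader.add_xpath('designer', './/*[contains(@class, "nav_pro_text")]/a/br/following-sibling::node()[3][self::text()]')
            loader.add_xpath('title', './/*[contains(@class, "nav_pro_text")]/a/strong/text()')
            loader.add_xpath('img_url', './/div/a/img/@src')
            loader.add_xpath('link', './/*[contains(@class, "nav_pro_text")]/a/@href')
            yield loader.load_item()  # one item per product, hence one row per product in the CSV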