Scrapy collect data from first element and post's title - python

I need Scrapy to collect data from this tag and retrieve all three parts in one piece. The output would be something like:
Tonka double shock boys bike - $10 (Denver).
<span class="postingtitletext">Tonka double shock boys bike - <span class="price">$10</span><small> (Denver)</small></span>
The second task is to collect data from the first span tag, so the result would be only:
2016 2004 Pontiac Grand Prix gt.
<p class="attrgroup"><span><b>2016 2004 Pontiac Grand Prix gt</b></span> <span>odometer: <b>164</b></span> <span>fuel : <b>gas</b></span> <span>transmission : <b>automatic</b></span> <span>title status : <b>clean</b></span></p>
Here is my code so far:
# -*- coding: utf-8 -*-
# scrapy crawl dmoz -o items.csv -t csv
import re
import scrapy
from scrapy.http import Request


# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://jxn.craigslist.org/search/cto?"
    ]
    BASE_URL = 'http://jxn.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/nos/vgm/" + item_id

            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = response.xpath("//p[@class='attrgroup']/span/b/text()").extract()
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item

For the posting title, get all the text nodes from the span tag and join them:
$ scrapy shell http://denver.craigslist.org/bik/5042090428.html
In [1]: "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
Out[1]: u'Tonka double shock boys bike - $10 (Denver)'
Note that the "Scrapy-way" to do this would be to use an ItemLoader and the Join() processor.
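A minimal sketch of that ItemLoader approach, reusing DmozItem and the selector from above; the PostingLoader class and the field wiring are illustrative assumptions, and the import paths are those of Scrapy 1.x (older versions use scrapy.contrib.loader):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import Join

class PostingLoader(ItemLoader):
    default_item_class = DmozItem
    # Join all matched text nodes into a single string, like "".join(...) above
    title_out = Join('')

def parse_attr(self, response):
    loader = PostingLoader(response=response)
    loader.add_xpath('title', "//span[@class='postingtitletext']//text()")
    # ... add the other fields as before, then:
    return loader.load_item()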
Second is to collect data from first span tag.
Since you haven't provided example input data, here is an educated guess:
response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0]
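If the page might not contain that block at all, extract()[0] raises an IndexError; in recent Scrapy versions extract_first() returns None instead:

response.xpath("//p[@class='attrgroup']/span/b/text()").extract_first()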

Related

Crawl iframe and page at the same time

I just wanted to know: is it possible to crawl a page on a website and extract data both from the page itself and from an iframe on that page at the same time?
I'm using scrapy with python and I already know how to extract data from the iframe...
Thank you for your help!!
Thanks to your answer, I made this... But I don't know what to put instead of 'url'... Can you help me again please?
# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from fnac.items import FnacItem
from urllib.request import urlopen
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup


class Fnac(CrawlSpider):  # scrapy.Spider
    name = 'FnacCom'
    allowed_domains = ['fnac.com']
    start_urls = ['http://www.fnac.com/MORMANE/srefA5533119-3387-5EC4-82B6-AA61216BF599']

    ##### To extract links in order to run the spider in them
    # rules = (
    #     Rule(LinkExtractor(allow=()), callback='parse'),
    # )

    def parse(self, response):
        soup = BeautifulSoup(urlopen(response.url), "lxml")
        iframexx = soup.find_all('iframe')
        for iframe in iframexx:
            yield scrapy.Request(iframe.attrs['src'], callback=self.parse2)

    ##### Main function
    def parse1(self, response):
        item1 = FnacItem()
        nb_sales = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()
        country = response.xpath('//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()
        yield scrapy.Request(url, meta={'item': item1})  # I don't know what to put instead of URL...

    def parse2(self, response):
        same_item = response.meta['item']
        address = response.xpath('//div/p/text()').re(r'.*Adresse \: (.*)\n?.*')
        email = response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()
        name = response.xpath('//div/p[@class="customer-policy-label"]/text()').re(r'Infos sur la boutique \: ([a-zA-Z0-9]*)')
        phone = response.xpath('//div/p/text()').re(r'.*Tél \: ([\d]*)\n?.*')
        siret = response.xpath('//div/p/text()').re(r'.*Siret \: ([\d]*)\n?.*')
        vat = response.xpath('//div/text()').re(r'.*TVA \: (.*)')
        if (len(name) != 0):
            item['name'] = ''.join(name).strip()
            item['address'] = ''.join(address).strip()
            item['phone'] = ''.join(phone).strip()
            item['email'] = ''.join(email).strip()
            item['nb_sales'] = ''.join(nb_sales).strip()
            item['country'] = ''.join(country).strip()
            item['vat'] = ''.join(vat).strip()
            item['siret'] = ''.join(siret).strip()
            return item
To combine information from different requests into a single item, you have to use the meta parameter of the request:
def parse1(self, response):
    item1 = {
        ...
    }
    yield Request(url='another_url.com', meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    same_item = response.meta['item']
    # keep populating the item with the second response
    ...
    yield same_item
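Applied to the spider above, one way to wire this up is to fill the item on the main page first and then pass it along with each iframe request. The sketch below makes that assumption (and pulls the iframe URLs with plain XPath instead of BeautifulSoup); it is a guess at the intended flow, not a tested implementation:

def parse(self, response):
    # Fill the item with the seller data from the main page
    item1 = FnacItem()
    item1['nb_sales'] = ''.join(response.xpath(
        '//table[@summary="données détaillée du vendeur"]/tbody/tr/td/span/text()').extract()).strip()
    item1['country'] = ''.join(response.xpath(
        '//table[@summary="données détaillée du vendeur"]/tbody/tr/td/text()').extract()).strip()

    # The iframe URL is what goes "instead of url": request each iframe
    # and carry the partially filled item along in meta
    for src in response.xpath('//iframe/@src').extract():
        yield scrapy.Request(response.urljoin(src), meta={'item': item1}, callback=self.parse2)

def parse2(self, response):
    # Keep populating the same item with data from the iframe page
    item = response.meta['item']
    item['email'] = ''.join(response.xpath('//div/ul/li[contains(text(),"@")]/text()').extract()).strip()
    yield item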

How to do multiple page scraping using Scrapy?

#----\
#-----*-----\
#----/       \
              \
#----\         \
#-----*-------- * <-- START
#----/         /
              /
#----\       /
#-----*-----/
#----/
Here is the structure of a website I want to scrape with scrapy, where * is a page and --- indicates a link. I want to scrape data from the # pages.
I have already written a scraper that can scrape data from a single # page.
import scrapy


class MyItem(scrapy.Item):
    topic = scrapy.Field()
    symptoms = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "medical"
    allowed_domains = ['medlineplus.gov']
    start_urls = ['https://medlineplus.gov/ency/article/000178.htm']

    def parse(self, response):
        item = MyItem()
        item["topic"] = response.css('h1.with-also::text').extract_first()
        item["symptoms"] = response.css("article div#section-2 li::text").extract()
        yield item
The starting webpage is https://medlineplus.gov/encyclopedia.html
I want to scrape info about all diseases in the encyclopedia.
You would need to start with the "encyclopedia.html" page, follow the "alpha" links (the A-Z article links), then, for every followed page, follow the links to the articles.
You can do this with a CrawlSpider and the Link Extractors, but, since the crawling depth is small, we can do this with a regular Spider:
from urlparse import urljoin  # Python 2 only

import scrapy
from scrapy.http import Request


class MyItem(scrapy.Item):
    topic = scrapy.Field()
    symptoms = scrapy.Field()


class MedicalSpider(scrapy.Spider):
    name = "medical"
    allowed_domains = ['medlineplus.gov']
    start_urls = ['https://medlineplus.gov/encyclopedia.html']

    def parse(self, response):
        for link in response.css("ul.alpha-links li a::attr(href)").extract():
            yield Request(urljoin(response.url, link), callback=self.parse_alpha_page)

    def parse_alpha_page(self, response):
        for link in response.css("ul#index li a::attr(href)").extract():
            yield Request(urljoin(response.url, link), callback=self.parse_page)

    def parse_page(self, response):
        item = MyItem()
        item["topic"] = response.css('h1.with-also::text').extract_first()
        item["symptoms"] = response.css("article div#section-2 li::text").extract()
        yield item
Note that it looks like there is a better way to get the desired data from the MedlinePlus (check out the "For Developers" page).
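For reference, a rough sketch of the CrawlSpider variant mentioned before the spider above, reusing the same CSS selectors; the rule setup is an assumption and has not been run against the site:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class MedicalCrawlSpider(CrawlSpider):
    name = "medical_crawl"
    allowed_domains = ['medlineplus.gov']
    start_urls = ['https://medlineplus.gov/encyclopedia.html']

    rules = (
        # Follow the A-Z index links from the encyclopedia landing page
        Rule(LinkExtractor(restrict_css='ul.alpha-links li a'), follow=True),
        # Parse every article linked from the per-letter index pages
        Rule(LinkExtractor(restrict_css='ul#index li a'), callback='parse_page'),
    )

    def parse_page(self, response):
        yield {
            'topic': response.css('h1.with-also::text').extract_first(),
            'symptoms': response.css("article div#section-2 li::text").extract(),
        }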

Get the Type category for the URLs using scrapy

For this URL, I need all the product URLs and their respective TYPE.
So the output should be:
Product_URL1 Blouse
Product_URL2 Crop Top
Product_URL3 Tank Top
Product_URL4 Strappy Top
Product_URL5 Tube Top
Below is my code; I guess everything is right except the xpath for item['type'].
from scrapy.spiders import CrawlSpider
import scrapy
from scrapy.http.request import Request


class JabongItem(scrapy.Item):
    base_link = scrapy.Field()
    type = scrapy.Field()
    count = scrapy.Field()
    product_name = scrapy.Field()
    product_link = scrapy.Field()


class JabongScrape(CrawlSpider):
    name = "jabong"
    allowed_domains = "jabong.com"
    start_urls = ["http://www.jabong.com/women/clothing/tops-tees-shirts/tops", "http://www.jabong.com/women/clothing/tops-tees-shirts/tees"]

    def parse(self, response):
        item = JabongItem()
        try:
            for idx in range(0, 20):
                item['type'] = response.xpath("//div[contains(@class, 'options')]/label/a/text()").extract()[idx]
                item['base_link'] = response.url + response.xpath("//div[contains(@class, 'options')]/label/a/@href").extract()[idx] + "?ax=1&page=1&limit=" + (response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx]).replace("[", "").replace("]", "") + "&sortField=popularity&sortBy=desc"
                item['count'] = (response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx]).replace("[", "").replace("]", "")
                yield Request(item['base_link'], callback=self.parse_product_link,
                              meta={'item': item, 'count': int(item['count'])}, dont_filter=True)
        except:
            pass

    def parse_product_link(self, response):
        item = response.meta['item']
        try:
            for i in range(0, response.meta['count']):
                item['product_link'] = response.xpath("//div[contains(@class, 'col-xxs-6 col-xs-4 col-sm-4 col-md-3 col-lg-3 product-tile img-responsive')]/a/@href").extract()[i]
                # item['original_price'] = response.xpath("section.row > div:nth-child(1) > a:nth-child(1) > div:nth-child(2) > div:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text").extract()[idx]
                print i
                yield item
        except:
            pass
And the jbng_base_links.txt contains "http://www.jabong.com/women/clothing/tops-tees-shirts/tops"
As Rafael pointed out, the easiest way of doing this is simply to restructure your spider manually to follow this order:
Go to webpage
Find type urls
Go to every type url -> scrape items
It could be as simple as:
class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = []

    def parse(self, response):
        """this will parse landing page for type urls"""
        urls = response.xpath("//div[contains(text(),'Type')]/..//a/@href").extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, self.parse_type)

    def parse_type(self, response):
        """this will parse every type page for items"""
        type_name = response.xpath("//a[@class='filtered-brand']/text()").extract_first()
        product_urls = ...
        for url in product_urls:
            yield {'type': type_name, 'url': url}
        # handle next page
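The trailing "# handle next page" comment is left open above; one common pattern is to follow the site's "next" link back into the same callback. A sketch of the completed parse_type under that assumption (the product and next-page selectors below are guesses, not taken from the site):

    def parse_type(self, response):
        """parse one type page for items, then follow pagination"""
        type_name = response.xpath("//a[@class='filtered-brand']/text()").extract_first()
        # hypothetical product selector; replace with the real one
        product_urls = response.xpath("//a[contains(@class, 'product-link')]/@href").extract()
        for url in product_urls:
            yield {'type': type_name, 'url': response.urljoin(url)}

        # handle next page: re-enter this callback with the "next" link, if present
        next_page = response.xpath("//a[@rel='next']/@href").extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse_type)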

Python Recursive Scraping with Scrapy

I'm trying to make a scraper that will pull links, titles, prices and the body of posts on craigslist. I have been able to get the prices, but it returns the price for every listing on the page, not just for the specific row. I am also unable to get it to go to the next page and continue scraping.
This is the tutorial I am using - http://mherman.org/blog/2012/11/08/recursively-scraping-web-pages-with-scrapy/
I've tried suggestions from this thread, but still can't make it work - Scrapy Python Craigslist Scraper
The page I'm trying to scrape is - http://medford.craigslist.org/cto/
In the link price variable, if I remove the // before span[@class="l2"] it returns no prices, but if I leave it there it includes every price on the page.
For the rules, I've tried playing with the class tags but it seems to hang on the first page. I'm thinking I might need separate spider classes?
Here is my code:
#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      CD
#
# Created:     02/03/2014
# Copyright:   (c) CD 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import *
import sys


class PageSpider(BaseSpider):
    name = "cto"
    allowed_domains = ["medford.craigslist.org"]
    start_urls = ["http://medford.craigslist.org/cto/"]

    rules = (Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//span[@class="button next"]',)),
                  callback="parse", follow=True), )

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"] | //span[@class="l2"]')
        for title in titles:
            item = CraigslistSampleItem()
            item['title'] = title.select("a/text()").extract()
            item['link'] = title.select("a/@href").extract()
            item['price'] = title.select('//span[@class="l2"]//span[@class="price"]/text()').extract()
            url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
            yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)

    def parse_item_page(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        item['description'] = hxs.select('//section[@id="postingbody"]/text()').extract()
        return item
The idea is simple: find all paragraphs in the div with class="content", then from every paragraph extract the link, the link text and the price. Note that the select() method is currently deprecated; use xpath() instead.
Here's a modified version of parse() method:
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    rows = hxs.select('//div[@class="content"]/p[@class="row"]')
    for row in rows:
        item = CraigslistSampleItem()
        link = row.xpath('.//span[@class="pl"]/a')
        item['title'] = link.xpath("text()").extract()
        item['link'] = link.xpath("@href").extract()
        item['price'] = row.xpath('.//span[@class="l2"]/span[@class="price"]/text()').extract()
        url = 'http://medford.craigslist.org{}'.format(''.join(item['link']))
        yield Request(url=url, meta={'item': item}, callback=self.parse_item_page)
This is a sample of what I'm getting:
{'description': [u"\n\t\tHave a nice, sturdy, compact car hauler/trailer. May be used for other hauling like equipstment, ATV's and the like, Very solid and in good shape. Parice to sell at only $995. Call Bill at 541 944 2929 top see or Roy at 541 9733421. \n\t"],
'link': [u'/cto/4354771900.html'],
'price': [u'$995'],
'title': [u'compact sturdy car trailer ']}
Hope that helps.

Scrapy crawls first page, does not follow other links

I'm working on a Scrapy spider that crawls this website:
Page 1: http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626
Subpage 1 example (from page 1) : http://www.randstad.nl/mwp2/faces/baanDetails?aanvraagnummer=1177658&_adf.ctrl-state=16ovo4scmu_4&sc=0&_afrLoop=15790145645866794
Page 2 : http://www.randstad.nl/mwp2/faces/baanDetails?aanvraagnummer=1509606&_adf.ctrl-state=16ovo4scmu_4&sc=0&_afrLoop=15790170887272918
So what I think goes wrong is this: the spider gets all the links from page 1 and visits those sub pages, then goes to page 2 and does the same again, but I think that after page 1 it only gets the first link (instead of all the links on page 2) and then continues to page 3 and does the same there.
I have tried a lot of different code and still can't get it right; I hope you can have a look at my code and help me figure out what I am doing wrong.
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from craig.items import CraigItem
from scrapy.http import Request
import re


class CraigSpiderSpider(CrawlSpider):
    name = "craig_spider"
    allowed_domains = ["randstad.nl"]
    start_urls = (
        "http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626",
        "http://www.randstad.nl/mwp2/faces/baanZoeken?"
    )
    rules = (Rule(SgmlLinkExtractor(allow=("filters=vakgebied!5626", "pagina=")), callback="parse", follow=True),
             )

    def parse(self, response):
        sel = Selector(response)

        # Fetch all links
        for link in sel.xpath(".//a[contains(@class, 'outer-read-more-link')]/@href").extract():
            yield Request(link, callback=self.parse)

        # Go through all links and fetch all the text
        text_list = sel.xpath('//div[@id="basePage:page:twoColumn:r2:0:functieOmschrijvingPanel::content"]/text()').extract()
        title_list = sel.xpath('//div[@id="basePage:page:panelTitleHeader"]//td[@class="af_panelBox_header-text"]//h1[@class="af_panelBox_header-element"]/text()').extract()
        label_samenvatting = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pfl1b"]//table//td//label/text()').extract()
        opleidingniveau_list = sel.xpath('//div[@id="basePage:page:twoColumn:r1:0:pl1"]//ul//li/text()').extract()
        soortbaan_list = sel.xpath('//table[@id="basePage:page:twoColumn:r1:0:soortDienstverbandRNL"]//td[@class="AFContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        uren_per_week_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:it5"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        vakgebied_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:vakgebieden"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]//li/text()').extract()
        branche_list = sel.xpath('//tr[@id="basePage:page:twoColumn:r1:0:aanvraagBranch"]//td[@class="AFPanelFormLayoutContentCell af_panelLabelAndMessage_content-cell"]/text()').extract()
        datum = sel.xpath('//span[@class="date-changed"]/text()').extract()

        if text_list:
            title = ' '.join(title_list)
            text = ' '.join(text_list)
            samenvatting = ' '.join(label_samenvatting)
            opleidingniveau = ' '.join(opleidingniveau_list)
            soortbaan = ' '.join(soortbaan_list)
            urenperweek = ' '.join(uren_per_week_list)
            vakgebied = ' '.join(vakgebied_list)
            branche = ' '.join(branche_list)

            item = CraigItem()
            item['link'] = response.url
            item['title'] = title
            item['text'] = text
            item['samenvatting'] = samenvatting
            item['opleidingniveau'] = opleidingniveau
            item['soortbaan'] = soortbaan
            item['urenperweek'] = urenperweek
            item['vakgebied'] = vakgebied
            item['branche'] = branche
            item['date'] = datum
            yield item
Items code:
from scrapy.item import Item, Field


class CraigItem(Item):
    title = Field()
    text = Field()
    link = Field()
    site = Field()
    date = Field()
    samenvatting = Field()
    opleidingniveau = Field()
    soortbaan = Field()
    urenperweek = Field()
    vakgebied = Field()
    branche = Field()
I think you should use CrawlSpider, not BaseSpider, when you need to follow links.
class CraigSpider(CrawlSpider):
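A minimal sketch of how that could look for the spider above. Note that a CrawlSpider must not override parse() (it uses it internally to apply the rules), so the detail pages get their own callback here; the rule patterns, the parse_item name, and the shortened title XPath are assumptions, not a tested implementation:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from craig.items import CraigItem


class CraigSpider(CrawlSpider):
    name = "craig_spider"
    allowed_domains = ["randstad.nl"]
    start_urls = ["http://www.randstad.nl/mwp2/faces/baanZoeken?pagina=1&filters=vakgebied!5626"]

    rules = (
        # Keep paging through the result lists
        Rule(SgmlLinkExtractor(allow=("pagina=",)), follow=True),
        # Hand every job detail page to parse_item
        Rule(SgmlLinkExtractor(allow=("baanDetails",)), callback="parse_item"),
    )

    def parse_item(self, response):
        # Extract fields from a single job detail page, reusing the XPaths
        # from the original spider (only two shown to keep the sketch short)
        sel = Selector(response)
        item = CraigItem()
        item['link'] = response.url
        item['title'] = ' '.join(sel.xpath(
            '//div[@id="basePage:page:panelTitleHeader"]//h1[@class="af_panelBox_header-element"]/text()').extract())
        yield item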
