Scrapy only returning html, without content - python

I am trying to get the response...
<span id="BusinessDbaName" class="dataItem">TECHNO COATINGS INC</span>
scrapy is instead returning...
******name = [u'<span id="BusinessDbaName" class="dataItem"></span>']
i.e. the HTML element is returned, but not the text content within the tags.
Question: What would cause this, and how do I fix it?
Here is my source code:
import scrapy

class lniSpider(scrapy.Spider):
    name = "lni"
    allowed_domains = ["secure.lni.wa.gov"]
    start_urls = [
        "https://secure.lni.wa.gov/verify/Detail.aspx?UBI=602123234&SAW="
    ]

    def parse(self, response):
        for sel in response.xpath('//body'):
            name = sel.xpath('//*[@id="BusinessDbaName"]').extract()
            print ("******name = "), name

Is it possible to scrape this sprite SVG?

Can I scrape this with standard Scrapy or do I need to use Selenium?
The html is:
<td class="example"><sprite-svg name="EXAMPLE2"><svg><use
xlink:href="/spritemap/1_0_30#sprite-EXAMPLE2"></use></svg></sprite-svg></td>
I need the value "EXAMPLE2" somehow.
The XPath which works in the browser is //td[@class='example']//*[local-name() = 'svg']
When I put it into Scrapy I use the following code, but I get an XPath error.
'example' : div.xpath(".//td[@class='example']//*[local-name() = 'svg']()").extract()
Any ideas how to scrape it?
Looking at the table, each svg sprite is under a class 'rug_X'
Something like:
import scrapy

class RaceSpider(scrapy.Spider):
    name = 'race'
    allowed_domains = ['thedogs.com.au']
    start_urls = ['https://www.thedogs.com.au/racing/gawler/2020-07-07/1/the-bunyip-maiden-stake-pr2-division1']

    def parse(self, response):
        for a in response.xpath('//tbody/tr'):
            dog = a.xpath('.//td[@class="table__cell--tight race-runners__name"]/div/a/text()').get()
            number = a.xpath('.//td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name').get()
            cleaned_num = int(number.replace('rug_', ''))
            grade = a.xpath('.//td[@class="race-runners__grade"]/text()').get()
            item = {'grade': grade, 'greyhound': dog, 'rug': cleaned_num}
            yield item
You could also use item loaders with a custom function to clean up the response you get.
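For example, a minimal sketch (not from the original answer; the item and processor names are illustrative, and depending on your Scrapy version the processors import from itemloaders.processors or scrapy.loader.processors):
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst

def clean_rug(value):
    # turn 'rug_5' into the integer 5
    return int(value.replace('rug_', ''))

class RaceItem(scrapy.Item):
    greyhound = scrapy.Field(output_processor=TakeFirst())
    rug = scrapy.Field(input_processor=MapCompose(clean_rug), output_processor=TakeFirst())

# inside parse(), for each table row:
#     loader = ItemLoader(item=RaceItem(), selector=row)
#     loader.add_xpath('greyhound', './/td[@class="table__cell--tight race-runners__name"]/div/a/text()')
#     loader.add_xpath('rug', './/td[@class="table__cell--tight race-runners__box"]/sprite-svg/@name')
#     yield loader.load_item()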
Yes, you can do it with Scrapy:
response.xpath("//td[@class='table__cell--tight race-runners__box']/sprite-svg/@name").getall()
Working Scrapy code:
import scrapy

class Test(scrapy.Spider):
    name = 'Test'
    start_urls = [
        'https://www.thedogs.com.au/racing/gawler/2020-07-07/1/the-bunyip-maiden-stake-pr2-division1']

    def parse(self, response):
        return {"nameList": response.xpath("//td[@class='table__cell--tight race-runners__box']/sprite-svg/@name").getall()}

Can't scrape next page contents using Scrapy

I want to scrape the contents of the next pages too, but the spider doesn't go to the next page. My code is:
import scrapy

class AggregatorSpider(scrapy.Spider):
    name = 'aggregator'
    allowed_domains = ['startech.com.bd/component/processor']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            print ('\n')
            print (name)
            print (price)
            print ('\n')
        next_page_url = response.xpath('//*[@class="pagination"]/li/a/@href').extract_first()
        # absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(next_page_url)
I didn't use urljoin because next_page_url already gives me the whole URL. I also tried the dont_filter=True argument in the yield, which gives me an infinite loop through the 1st page. The message I'm getting from the terminal is [scrapy.spidermiddlewares.offsite] DEBUG: Filtered offsite request to 'www.startech.com.bd': <GET https://www.startech.com.bd/component/processor?page=2>
This is because your allowed_domains variable is wrong; use allowed_domains = ['www.startech.com.bd'] instead (see the docs).
You can also modify your next page selector in order to avoid going to page one again:
import scrapy

class AggregatorSpider(scrapy.Spider):
    name = 'aggregator'
    allowed_domains = ['www.startech.com.bd']
    start_urls = ['https://startech.com.bd/component/processor']

    def parse(self, response):
        processor_details = response.xpath('//*[@class="col-xs-12 col-md-4 product-layout grid"]')
        for processor in processor_details:
            name = processor.xpath('.//h4/a/text()').extract_first()
            price = processor.xpath('.//*[@class="price space-between"]/span/text()').extract_first()
            yield {'name': name, 'price': price}
        next_page_url = response.css('.pagination li:last-child a::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(next_page_url)

Scrapy: Looping through search results only returns first item

I'm scraping a site by going through the search page, then looping through all results within. However it only seems to be returning the first result for each page. I also don't think it's hitting the start page's results either.
Secondly, the price is returned as a Unicode string containing the £ symbol. How can I remove the symbol and keep just the price?
'regular_price': [u'\xa38.59'],
Here is the HTML:
http://pastebin.com/F8Lud0hu
Here's the spider:
import scrapy
import random
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from cdl.items import candleItem

class cdlSpider(CrawlSpider):
    name = "cdl"
    allowed_domains = ["www.xxxx.co.uk"]
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(
            allow=['advanced_search_result\.php\?sort=2a&page=\d*']),
            callback='parse_listings',
            follow=True)
    ]

    def parse_listings(self, response):
        sel = Selector(response)
        urls = sel.css('a.product_img')
        for url in urls:
            url = url.xpath('@href').extract()[0]
            return scrapy.Request(url, callback=self.parse_item)

    def parse_item(self, response):
        candle = candleItem()
        n = response.css('.prod_info_name h1')
        candle['name'] = n.xpath('.//text()').extract()[0]
        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price').xpath('.//text()').extract()
        else:
            candle['was_price'] = response.css('.was_price strong').xpath('.//text()').extract()
            candle['now_price'] = response.css('.now_price strong').xpath('.//text()').extract()
        candle['referrer'] = response.request.headers.get('Referer', None)
        candle['url'] = response.request.url
        yield candle
Yes, it's returning only the first result because of your parse_listings method: you're returning the first URL when you should be yielding it. I would do something like:
    def parse_listings(self, response):
        for url in response.css('a.product_img::attr(href)').extract():
            yield scrapy.Request(url, callback=self.parse_item)
In that case I would even do something like:
class CdlspiderSpider(CrawlSpider):
    name = 'cdlSpider'
    allowed_domains = ['www.xxxx.co.uk']
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(allow='advanced_search_result\.php\?sort=2a&page=\d*')),
        Rule(LinkExtractor(restrict_css='a.product_img'), callback='parse_item')
    ]

    def parse_item(self, response):
        ...
        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price::text').re_first(r'\d+\.?\d*')
        else:
            candle['was_price'] = response.css('.was_price strong::text').re_first(r'\d+\.?\d*')
            candle['now_price'] = response.css('.now_price strong::text').re_first(r'\d+\.?\d*')
        ...
        return candle
To remove the £, just replace it with an empty string like this:
pricewithpound = u'\xa38.59'
price = pricewithpound.replace(u'\xa3', '')
To investigate the Scrapy issue, can you please provide the HTML source?

Python: Scrapy start_urls list able to handle .format()?

I want to parse a list of stocks, so I am trying to format the end of my start_urls list so that I can just add the symbol instead of the entire URL.
Spider class with start_urls inside stock_list method:
class MySpider(BaseSpider):
    symbols = ["SCMP"]
    name = "dozen"
    allowed_domains = ["yahoo.com"]

    def stock_list(stock):
        start_urls = []
        for symb in symbols:
            start_urls.append("http://finance.yahoo.com/q/is?s={}&annual".format(symb))
        return start_urls

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        revenue = hxs.select('//td[@align="right"]')
        items = []
        for rev in revenue:
            item = DozenItem()
            item["Revenue"] = rev.xpath("./strong/text()").extract()
            items.append(item)
        return items[0:3]
It all runs correctly if I get rid of stock_list and just use a plain start_urls list, but as it currently stands it exports nothing more than an empty file.
Also, should I possibly try a sys.argv setup so that I can just type the stock symbol as an argument at the command line when I run $ scrapy crawl dozen -o items.csv?
Typically the shell prints something like 2015-04-25 14:50:57-0400 [dozen] DEBUG: Crawled (200) <GET http://finance.yahoo.com/q/is?s=SCMP+Income+Statement&annual> among the LOG/DEBUG output, but it currently doesn't include that line, implying the start_urls aren't being formatted correctly.
The proper way to implement dynamic start URLs is to use start_requests().
Using start_urls is the preferred practice when you have a static list of starting URLs.
start_requests(): This method must return an iterable with the first Requests to crawl for this spider.
Example:
import scrapy

class MySpider(BaseSpider):
    name = "dozen"
    allowed_domains = ["yahoo.com"]
    stock = ["SCMP", "APPL", "GOOG"]

    def start_requests(self):
        BASE_URL = "http://finance.yahoo.com/q/is?s={}"
        for s in self.stock:
            yield scrapy.Request(url=BASE_URL.format(s))

    def parse(self, response):
        # parse the responses here
        pass
This way you also use a generator instead of a pre-generated list, which scales better with a large stock list.
I would use a for loop, like this:
class MySpider(BaseSpider):
    stock = ["SCMP", "APPL", "GOOG"]
    name = "dozen"
    allowed_domains = ["yahoo.com"]

    def stock_list(stock):
        start_urls = []
        for i in stock:
            start_urls.append("http://finance.yahoo.com/q/is?s={}".format(i))
        return start_urls

    start_urls = stock_list(stock)
Then assign the result of the function call to start_urls at the bottom of the class body, as I have done.
UPDATE
Using Scrapy 0.24
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector

class MySpider(scrapy.Spider):
    symbols = ["SCMP"]
    name = "yahoo"
    allowed_domains = ["yahoo.com"]

    def stock_list(symbols):
        start_urls = []
        for symb in symbols:
            start_urls.append("http://finance.yahoo.com/q/is?s={}&annual".format(symb))
        return start_urls

    start_urls = stock_list(symbols)

    def parse(self, response):
        revenue = Selector(response=response).xpath('//td[@align="right"]').extract()
        print(revenue)
You may want to tweak the xpath to get exactly what you want; it seems to be pulling back a fair amount of stuff. But I've tested this and the scraping is working as expected.
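As for the command-line part of the question: rather than a sys.argv setup, Scrapy spiders accept arguments passed with -a, which show up as attributes on the spider. A minimal sketch (the symbols argument name is just an example, not from either answer):
import scrapy

class MySpider(scrapy.Spider):
    name = "dozen"
    allowed_domains = ["yahoo.com"]

    def start_requests(self):
        # `scrapy crawl dozen -a symbols=SCMP,GOOG -o items.csv` sets self.symbols to "SCMP,GOOG"
        for symb in getattr(self, "symbols", "SCMP").split(","):
            yield scrapy.Request("http://finance.yahoo.com/q/is?s={}&annual".format(symb))

    def parse(self, response):
        pass  # parse the response here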

How to crawl multiple pages in a single spider using scrapy

I need to fetch the URLs of each product from this page, http://www.stalkbuylove.com/new-arrivals/week-2.html#/page/1, and then fetch the details of each product from its product link. I am not sure how to do it.
import scrapy
import json
import redis

r_server = redis.Redis('localhost')

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["stalkbuylove.com"]
    start_urls = [
        "http://www.stalkbuylove.com/new-arrivals/week-2.html#/page/1"
    ]

    def parse(self, response):
        for sel in response.css('.product-detail-slide'):
            name = sel.xpath('div/a/@title').extract()
            price = sel.xpath('div/span/span/text()').extract()
            productUrl = sel.xpath('div/a/@href').extract()
            request = scrapy.Request(''.join(productUrl), callback=self.parseProductPage)
            r_server.hset(name, "Name", name)
            r_server.hset(name, "Price", price)
            r_server.hset(name, "ProductUrl", productUrl)
            print name, price, productUrl

    def parseProductPage(self, response):
        for sel in response.css('.top-details-product'):
            availability = sel.xpath('div/link/@href').extract()
            print availability
Can anyone help? Once I have the product URL, how do I crawl it? Right now I am calling parseProductPage, which is not working.
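For what it's worth, a minimal sketch of the likely fix (an assumption on my part, not from the original thread): Scrapy only follows a request if the callback yields or returns it, so the Request built inside parse needs to be yielded:
    def parse(self, response):
        for sel in response.css('.product-detail-slide'):
            productUrl = sel.xpath('div/a/@href').extract_first()
            if productUrl:
                # yield the request so Scrapy actually schedules the product page
                yield scrapy.Request(response.urljoin(productUrl), callback=self.parseProductPage)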
