import scrapy
from multiple_pages.items import YieldItem
class YelpSpider(scrapy.Spider):
name = "yelp"
allowed_domains = ["yelp.com"]
start_urls = ('http://www.yelp.com/'
List item
)
def parse(self, response):
item =YieldItem()
item['restaurents'] = response.xpath('//span[#class="indexed-biz-name"]//text()').extract()
item['rating'] = response.xpath('//div[#class="rating-large"]').extract()
item['phonenumber'] = response.xpath('//span[#class="biz-phone"]//a//text()').extract()
print item
When you use // in your XPath it selects all nodes in the document from the current node that match the selection, no matter where they are. So I guess your selecting several text fields.
Try with something more specific like:
item['phonenumber'] = response.xpath('//span[#class="biz-phone"]/text()').extract()
Related
I am trying to figure out if my scrapy tool is correctly hitting the product_link for the request callback - 'yield scrapy.Request(product_link, callback=self.parse_new_item)'
product_link should be 'https://www.antaira.com/products/10-100Mbps/LNX-500A'
but I have not been able to confirm if my program is jumping into the next step created so that I can retrieve the correct yield return. Thank you!
# Import the required libraries
import scrapy
# Import the Item class with fields
# mentioned int he items.py file
from ..items import AntairaItem
# Spider class name
class productJumper(scrapy.Spider):
# Name of the spider
name = 'productJumper'
# The domain to be scraped
allowed_domains = ['antaira.com']
# The URLs to be scraped from the domain
start_urls = ['https://www.antaira.com/products/10-100Mbps']
#target_url = ['https://www.antaira.com/products/10-100Mbps/LNX-500A']
# First Step: Find every div with the class 'product-container' and step into the links
def parse(self, response):
#product_link = response.urljoin(rel_product_link)
# creating items dictionary
items = AntairaItem()
rel_product_link = response.css('div.center767')
for url in rel_product_link:
rel_product_link = response.xpath('//div[#class="product-container"]//a/#href').get(),
product_link = response.urljoin('rel_product_link'),
items['rel_product_link'] = rel_product_link,
items['product_link'] = product_link
#yield items
# 2nd Step: Return a list of the all products-links that will be scrapped
#yield {
# take the first relative product link
# 'rel_product_link' : rel_product_link,
# 'product_link' : product_link,
#}
yield scrapy.Request(product_link, callback=self.parse_new_item)
# Final Step: Run through each product and Yield the results
def parse_new_item(self, response):
for product in response.css('main.products'):
name = product.css(('h1.product-name::text').strip(' \t\n\r')).get()
features = product.css('section.features h3 + ul').getall()
overview = product.css('.products .product-overview::text').getall()
main_image = product.css('div.selectors img::attr(src)').get()
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items
You have a couple of issues:
scrapy items are essentially dictionaries and are therefore mutable. You need to create a unique item for each and every yield statement.
your second parse callback is referencing a variable items that it doesn't have access too because it was defined in your first parse callback.
In your urljoin method you are using a string literal instead of a variable for rel_product_link
In the example below I fixed those issues and made some additional notes
import scrapy
from ..items import AntairaItem
class ProductJumper(scrapy.Spider): # classes should be TitleCase
name = 'productJumper'
allowed_domains = ['antaira.com']
start_urls = ['https://www.antaira.com/products/10-100Mbps']
def parse(self, response):
# iterate through each of the relative urls
for url in response.xpath('//div[#class="product-container"]//a/#href').getall():
product_link = response.urljoin(url) # use variable
yield scrapy.Request(product_link, callback=self.parse_new_item)
def parse_new_item(self, response):
for product in response.css('main.products'):
items = AntairaItem() # Unique item for each iteration
items['product_link'] = response.url # get the product link from response
name = product.css(('h1.product-name::text').get().strip()
features = product.css('section.features h3 + ul').getall()
overview = product.css('.products .product-overview::text').getall()
main_image = product.css('div.selectors img::attr(src)').get()
rel_links = product.xpath("//script/#src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
items['name'] = name,
items['features'] = features,
items['overview'] = overview,
items['main_image'] = main_image,
items['rel_links'] = rel_links,
yield items
Can I scrape this with standard Scrapy or do I need to use Selenium?
The html is:
<td class="example"><sprite-svg name="EXAMPLE2"><svg><use
xlink:href="/spritemap/1_0_30#sprite-EXAMPLE2"></use></svg></sprite-svg></td>
I need the value "EXAMPLE2" somehow.
The xpath which works in the browser is //td[#class='example']//*[local-name() = 'svg']
When I put it into scrapy I use the following code but am getting XPATH error.
'example' : div.xpath(".//td[#class='example']//*[local-name() = 'svg']
()").extract()
Any ideas how to scrape it?
Looking at the table, each svg sprite is under a class 'rug_X'
Something like
import scrapy
class RaceSpider(scrapy.Spider):
name = 'race'
allowed_domains = ['thedogs.com.au']
start_urls = ['https://www.thedogs.com.au/racing/gawler/2020-07-07/1/the-bunyip-maiden-stake-pr2-division1']
item = {}
def parse(self, response):
row = response.xpath('//tbody/tr')
dog = a.xpath('.//td[#class="table__cell--tight race-runners__name"]/div/a/text()').get()
number = a.xpath('.//td[#class="table__cell--tight race-runners__box"]/sprite-svg/#name').get()
cleaned_num = int(number.replace('rug_',''))
grade = a.xpath('.//td[#class="race-runners__grade"]/text()').get()
item = {'grade':grade, 'greyhound':dog,'rug':cleaned_num}
yield item
You could also use item loaders with a custom function to clean up the response you get.
Yes. You can do it with scrapy :
response.xpath("//td[#class='table__cell--tight race-runners__box']/sprite-svg/#name").getall()
Working scrapy code :
import scrapy
class Test(scrapy.Spider):
name = 'Test'
start_urls = [
'https://www.thedogs.com.au/racing/gawler/2020-07-07/1/the-bunyip-maiden-stake-pr2-division1']
def parse(self, response):
return {"nameList": response.xpath("//td[#class='table__cell--tight race-runners__box']/sprite-svg/#name").getall()}
i'm trying to scrape this site using scrapy but returns all the value in a
single cell, i except each value in a different row.
example:
milage: 25
milage: 377
milage: 247433
milage: 464130
but i'm getting the data like this
example:
milage:[u'25',
u'377',
u'247433',
u'399109',
u'464130',
u'399631',
u'435238',
u'285000',
u'287470',
u'280000']
here is my code
import scrapy
from ..items import ExampleItem
from scrapy.selector import HtmlXPathSelector
url = 'https://example.com'
class Example(scrapy.Spider):
name = 'example'
allowed_domains = ['www.example.com']
start_urls = [url]
def parse(self, response):
hxs = HtmlXPathSelector(response)
item_selector = hxs.select('//div[#class="listing_format card5 relative"]')
for fields in item_selector:
item = ExampleItem()
item ['Mileage'] = fields.select('//li[strong="Mileage"]/span/text()').extract()
yield item
You didn't show your site but may be you need relative XPath:
item ['Mileage'] = fields.select('.//li[strong="Mileage"]/span/text()').extract_first()
It sounds like you need to iterate over your milages.
for fields in item_selector:
milages = fields.select('//li[strong="Mileage"]/span/text()').extract()
for milage in milages:
item = CommercialtrucktraderItem()
item ['Mileage'] = milage
yield item
Also consider making your fields.select('//li[strong="Mileage"]/span/text()').extract() more specific?
I have a ScraPy Code that is running in shell, but when I try to export it to csv, it returns an empty file. It exports data when I do not go into a link and try to parse the description, but once I add the extra method of parsing the contents, it fails to work. Here is the code:
class MonsterSpider(CrawlSpider):
name = "monster"
allowed_domains = ["jobs.monster.com"]
base_url = "http://jobs.monster.com/v-technology.aspx?"
start_urls = [
"http://jobs.monster.com/v-technology.aspx"
]
for i in range(1,5):
start_urls.append(base_url + "page=" + str(i))
rules = (Rule(SgmlLinkExtractor(allow=("jobs.monster.com",))
, callback = 'parse_items'),)
def parse_items(self, response):
sel = Selector(response)
sites = sel.xpath('//div[#class="col-xs-12"]')
#items = []
for site in sites.xpath('.//article[#class="js_result_row"]'):
item = MonsterItem()
item['title'] = site.xpath('.//span[#itemprop = "title"]/text()').extract()
item['company'] = site.xpath('.//span[#itemprop = "name"]/text()').extract()
item['city'] = site.xpath('.//span[#itemprop = "addressLocality"]/text()').extract()
item['state'] = site.xpath('.//span[#itemprop = "addressRegion"]/text()').extract()
item['link'] = site.xpath('.//a[#data-m_impr_a_placement_id= "jsr"]/#href').extract()
follow = ''.join(item["link"])
request = Request(follow, callback = self.parse_dir_contents)
request.meta["item"] = item
yield request
#items.append(item)
#return items
def parse_dir_contents(self, response):
item = response.meta["item"]
item['desc'] = site.xpath('.//div[#itemprop = "description"]/text()').extract()
return item
Taking out the parse_dir_contents and uncommenting the empty "lists" list and "append" code was the original code.
Well, as #tayfun suggests you should use response.xpath or define the site variable.
By the way, you do not need to use sel = Selector(response). Responses come with the xpath function, there is no need to cover it into another selector.
However the main problem is that you restrict the domain of the spider. You define allowed_domains = ["jobs.monster.com"] however if you look at the URL to follow of your custom Request you can see that they are something like http://jobview.monster.com/ or http://job-openings.monster.com. In this case your parse_dir_contents is not executed (the domain is not allowed) and your item does not get returned so you won't get any results.
Change allowed_domains = ["jobs.monster.com"] to
allowed_domains = ["monster.com"]
and you will be fine and your app will work and return items.
You have an error in your parse_dir_contents method:
def parse_dir_contents(self, response):
item = response.meta["item"]
item['desc'] = response.xpath('.//div[#itemprop=description"]/text()').extract()
return item
Note the use of response. I don't know where you got site that you are currently using from.
Also, try to provide the error details when you post a question. Writing "it fails to work" doesn't say much.
I'm trying to scrape a site driven by some user input. For example, the user gives me the pid of a product and a name, and a separate program will launch the spider, gather the data, and return it to the user.
However, the only information I want are product and person which are found in two links to an xml. If I know these two links and the pattern, how do I build the callback to parse the different items?
For example, if I have these two Items defined:
class PersonItem(Item):
name = Field()
...
class ProductItem(Item):
pid = Field()
...
And I know their links have pattern:
www.example.com/person/*<name_of_person>*/person.xml
www.example.com/*<product_pid>*/product.xml
Then my spider would look something like this:
class MySpider(BaseSpider):
name = "myspider"
# simulated given by user
pid = "4545-fw"
person = "bob"
allowed_domains = ["http://www.example.com"]
start_urls = ['http://www.example.com/person/%s/person.xml'%person, 'http://www.example.com/%s/product.xml'%pid]
def parse(self, response):
# Not sure here if scrapping person or item
I know that I can define rules too using Rule(SgmlLinkExtractor()) and then giving the person and product each its own parse callback. However, I'm not sure how they apply here since I think rules are meant for crawling deeper, whereas I only need to scrape the surface level.
If you want to be retro-active you could put your logic in parse():
def parse(self, response):
if 'person.xml' in response.url:
item = PersonItem()
elif 'product.xml' in response.url:
item = ProductItem()
else:
raise Exception('Could not determine item type')
UPDATE:
If you want to be pro-active you could override start_requests():
class MySpider(BaseSpider):
name = "myspider"
allowed_domains = ["example.com"]
pid = "4545-fw"
person = "bob"
def start_requests(self):
start_urls = (
('http://www.example.com/person/%s/person.xml' % self.person, PersonItem),
('http://www.example.com/%s/product.xml' % self.pid, ProductItem),
)
for url, cls in start_urls:
yield Request(url, meta=dict(cls=cls))
def parse(self, response):
item = response.meta['cls']()