Scrapy trying to scrape business-name hrefs in Python

I am trying to scrape the href for each business on Yellow Pages. I am very new to Scrapy (this is my second day). I am using requests to build the actual search URL for the spider. What am I doing wrong in my code? Eventually I want Scrapy to visit each business and scrape its address and other information.
# -*- coding: utf-8 -*-
import scrapy
import requests

search = "Plumbers"
location = "Hammond, LA"
url = "https://www.yellowpages.com/search"
q = {'search_terms': search, 'geo_location_terms': location}
# let requests resolve the final search URL for the spider
page = requests.get(url, params=q)
page = page.url


class YellowpagesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['yellowpages.com']
    start_urls = [page]

    def parse(self, response):
        self.log("I just visited: " + response.url)
        items = response.css('span.text::text')
        for item in items:
            print(item)

To get the name use:
response.css('a[class=business-name]::text')
To get the href use:
response.css('a[class=business-name]::attr(href)')
In the final parse() this looks like:
for bas in response.css('a[class=business-name]'):
    item = {
        'name': bas.css('::text').extract_first(),
        'url': bas.css('::attr(href)').extract_first(),
    }
    yield item
Result:
2018-09-13 04:12:49 [quotes] DEBUG: I just visited: https://www.yellowpages.com/search?search_terms=Plumbers&geo_location_terms=Hammond%2C+LA
2018-09-13 04:12:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.yellowpages.com/search?search_terms=Plumbers&geo_location_terms=Hammond%2C+LA>
{'name': 'Roto-Rooter Plumbing & Water Cleanup', 'url': '/new-orleans-la/mip/roto-rooter-plumbing-water-cleanup-21804163?lid=149760174'}
2018-09-13 04:12:49 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.yellowpages.com/search?search_terms=Plumbers&geo_location_terms=Hammond%2C+LA>
{'name': "AJ's Plumbing And Heating Inc", 'url': '/new-orleans-la/mip/ajs-plumbing-and-heating-inc-16078566?lid=1001789407686'}
...
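To eventually visit each business and scrape its address, you can feed each link to response.follow(), which resolves the relative href against the current page. A minimal sketch; the detail-page selectors below are assumptions and need checking against the real yellowpages.com markup:

def parse(self, response):
    for bas in response.css('a[class=business-name]'):
        # response.follow accepts the <a> selector directly and resolves its href
        yield response.follow(bas, callback=self.parse_business)

def parse_business(self, response):
    yield {
        # hypothetical selectors: inspect the detail page to find the real ones
        'name': response.css('h1::text').extract_first(),
        'address': response.css('p.address::text').extract_first(),
        'url': response.url,
    }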

Related

scrape alibaba.com with scrapy_playwright

I am new to Scrapy and scrapy_playwright. I want to scrape alibaba.com. I know it is a JavaScript-based application, so I tried Playwright, but I am still getting None.
My code is:
import scrapy
from scrapy_playwright.page import PageMethod

search_value = 'laptop'


class AwesomeSpider(scrapy.Spider):
    name = "booby"

    def start_requests(self):
        yield scrapy.Request(
            f"https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&tab=all&SearchText={search_value}",
            meta=dict(
                playwright=True,
                playwrigt_include_page=True,
                playwright_page_method=[
                    PageMethod('goto', f'https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en&CatId=&tab=all&SearchText={search_value}'),
                    PageMethod('wait_for_timeout', 60000),  # wait 60 seconds
                    PageMethod('wait_for_selector', '.l-main-wrap', 60000)
                ]
            )
        )

    async def parse(self, response):
        title = response.xpath("(//div[@class='vc-htmlTemplateContainer t_1676641830878']/div)[2]/a/@href")
        yield {
            'title': title.get() if title else None
        }
And I am getting:
Scraped from <200 https://validate.perfdrive.com/lens/block?ssa=a99795db-b981-8828-bca3-87daa6aa9c95&ssb=abbi136aiaf6h1bgfp4hlkd64&ssc=www.lens.org%2Flens%2Fsearch%2Fpatent%2Flist%3Fpreview%3Dtrue%26q%3Dlaptop&ssd=991269335285462&sse=ecocnjdofknojhk&ssf=c9ab3cee0bedb05301c0fa513f4afd8d0c073b59&ssg=6a10c953-5bf5-3717-a941-555e26c73c5a&ssh=3d97b6e9-edfc-6509-3f86-803a66e3824f&ssi=ab7c248f-9434-71eb-be09-91b719343e59&ssj=d737b4b9-8e6f-7cdc-401f-2c3e7ba62174&ssk=unblock#lens.org&ssl=091907579953&ssm=05577187421153844162625898798330&ssn=e65d854d7e69cacfa836c6379dd8578b0acec94d829b-afa3-9f69-9a348f&sso=ae754689-ba19bcebb8bc1c709129568e38e4e0f8c704a0146e07ae3a&ssp=39377651711676737153167677333225970&ssq=87505222168731697759721675278062951691586&ssr=MTIzLjEwOC45Mi4xMDg=&sss=Mozilla/5.0%20(compatible;%20Yahoo!%20Slurp;%20http://help.yahoo.com/help/us/ysearch/slurp)&sst=Scrapy/2.8.0%20(+https://scrapy.org)&ssu=Mozilla/5.0%20(compatible;%20Yahoo!%20Slurp;%20http://help.yahoo.com/help/us/ysearch/slurp)&ssv=ns4tmqpp2uoptpm&ssw=OTkzYzBhZWMtMWNmZi00ZDEzLThlZGYtODJjZDg0YzAyMTJm&ssx=979693301198084&ssy=glbbdodpoiabfgpcbcl#efcbol#mddb#mdjjcbed&ssz=5096d6fc1403c11>
{'related_link': None}
2023-02-18 17:01:29 [scrapy.core.engine] INFO: Closing spider (finished)
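Two of the meta keys appear to be misspelled: scrapy-playwright reads playwright_include_page and playwright_page_methods (plural), so the keys playwrigt_include_page and playwright_page_method are silently ignored and none of the page methods ever run. The goto step is also redundant, because scrapy-playwright navigates to the request URL itself. A corrected sketch under those assumptions, keeping the asker's XPath (the t_1676641830878 class looks auto-generated and may change between visits):

import scrapy
from scrapy_playwright.page import PageMethod

search_value = 'laptop'


class AwesomeSpider(scrapy.Spider):
    name = "booby"

    def start_requests(self):
        url = ("https://www.alibaba.com/trade/search?fsb=y&IndexArea=product_en"
               f"&CatId=&tab=all&SearchText={search_value}")
        yield scrapy.Request(
            url,
            meta={
                "playwright": True,
                # note the plural key: a list of PageMethod objects to run on the page
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", ".l-main-wrap", timeout=60000),
                ],
            },
        )

    async def parse(self, response):
        # the asker's XPath; the class name is page-specific and may need updating
        title = response.xpath(
            "(//div[@class='vc-htmlTemplateContainer t_1676641830878']/div)[2]/a/@href"
        )
        yield {'title': title.get()}

playwright_include_page is only needed when parse() wants the live Playwright page object (which must then be closed manually), so it is omitted here. The validate.perfdrive block page in the log also points to bot detection, which no selector fix will get around.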

Scrapy Spider only pulling the first value from item container

I'm trying to scrape pricing info for comic books. What I'm ending up with is a spider that iterates through every instance of the top CSS selector but returns the desired value from only the first instance containing the pricing info I'm after.
My end goal is to be able to create a pipeline to feed an SQLite db with title, sku, price, and url for the actual listing. Here is my code:
class XmenscrapeSpider(scrapy.Spider):
    name = 'finalscrape'
    allowed_domains = ['mycomicshop.com']
    start_urls = ['https://www.mycomicshop.com/search?TID=222421']

    def parse(self, response):
        for item in response.css('td.highlighted'):
            yield {
                'title': response.xpath('.//meta[@itemprop="sku"]/@content').get()
            }
        next_page = response.css('li.next a::attr(href)').extract()[1]
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
My output looks like this:
{'title': '100 Bullets (1999 DC Vertigo) 1 CGC 9.8'}
2022-01-24 13:53:04 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.mycomicshop.com/search?TID=222421>
{'title': '100 Bullets (1999 DC Vertigo) 1 CGC 9.8'}
2022-01-24 13:53:04 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.mycomicshop.com/search?TID=222421>
{'title': '100 Bullets (1999 DC Vertigo) 1 CGC 9.8'}
2022-01-24 13:53:04 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.mycomicshop.com/search?TID=222421>
{'title': '100 Bullets (1999 DC Vertigo) 1 CGC 9.8'}
2022-01-24 13:53:04 [scrapy.core.scraper] DEBUG: Scraped from <200
https://www.mycomicshop.com/search?TID=222421>
{'title': '100 Bullets (1999 DC Vertigo) 1 CGC 9.8'}
If you look at the URL I'm trying to scrape, you can see that I'm only getting the desired value from the first tag, despite the spider iterating through the five instances of it on the page. I have a feeling the solution is simple, but I'm at my wit's end here. Any ideas on what would probably be a simple fix?
You need to use an XPath relative to item.
import scrapy


class XmenscrapeSpider(scrapy.Spider):
    name = 'finalscrape'
    allowed_domains = ['mycomicshop.com']
    start_urls = ['https://www.mycomicshop.com/search?TID=222421']

    def parse(self, response):
        for item in response.css('td.highlighted'):
            yield {
                # 'title': response.xpath('.//meta[@itemprop="sku"]/@content').get()
                'title': item.xpath('.//meta[@itemprop="name"]/@content').get()
            }
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
Note: You only loop over the highlighted items, and since the next page doesn't have any you won't get anything from it.
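For the end goal of feeding an SQLite db, a minimal pipeline sketch along these lines should work (the database file, table name, and field set are assumptions; match them to the fields your items actually carry):

import sqlite3


class ComicsSqlitePipeline:
    def open_spider(self, spider):
        self.conn = sqlite3.connect('comics.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS listings '
            '(title TEXT, sku TEXT, price TEXT, url TEXT)'
        )

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        # parameterised insert; absent keys fall back to None
        self.conn.execute(
            'INSERT INTO listings VALUES (?, ?, ?, ?)',
            (item.get('title'), item.get('sku'), item.get('price'), item.get('url')),
        )
        return item

Enable it in settings.py with ITEM_PIPELINES = {'myproject.pipelines.ComicsSqlitePipeline': 300} (the module path here is hypothetical).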

Item loader skips values in Scrapy

I'm using an item loader with Scrapy across multiple pages. The item loader returns empty dictionaries for some pages, even though when I use the same rules to parse only those pages it returns the values. Does anyone know why?
Spider code:
class AllDataSpider(scrapy.Spider):
    name = 'all_data'  # spider name
    allowed_domains = ['amazon.com']
    # write the start url
    start_urls = ["https://www.amazon.com/s?bbn=2619533011&rh=n%3A2619533011%2Cp_n_availability%3A2661601011&ie=UTF8&qid=1541604856&ref=lp_2619533011_nr_p_n_availability_1"]
    custom_settings = {'FEED_URI': 'pets_.csv'}  # write csv file name

    def parse(self, response):
        '''
        Parses item information from the category page.
        '''
        self.category = response.xpath(
            '//span[contains(@class, "nav-a-content")]//text()').extract_first()
        urls = response.xpath('//*[@data-asin]//@data-asin').extract()
        for url in urls:
            base = f"https://www.amazon.com/dp/{url}"
            yield scrapy.Request(base, callback=self.parse_item)
        next_page = response.xpath('//*[text()="Next"]//@href').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), dont_filter=True)

    def parse_item(self, response):
        loader = AmazonDataLoader(selector=response)
        loader.add_xpath("Availability", '//div[contains(@id, "availability")]//span//text()')
        loader.add_xpath("NAME", '//h1[@id="title"]//text()')
        loader.add_xpath("ASIN", '//*[@data-asin]//@data-asin')
        loader.add_xpath("REVIEWS", '//span[contains(@id, "Review")]//text()')
        rank_check = response.xpath('//*[@id="SalesRank"]//text()')
        if len(rank_check) > 0:
            loader.add_xpath("RANKING", '//*[@id="SalesRank"]//text()')
        else:
            loader.add_xpath("RANKING", '//span//span[contains(text(), "#")][1]//text()')
        loader.add_value("CATEGORY", self.category)
        return loader.load_item()
For some pages it returns all values, for some it returns just the category, and for others (which parse fine when scraped on their own) it returns nothing. It also closes the spider before finishing, without errors.
DEBUG: Scraped from <200 https://www.amazon.com/dp/B0009X29WK>
{'ASIN': 'B0009X29WK',
'Availability': 'In Stock.',
'NAME': " Dr. Elsey's Cat Ultra Premium Clumping Cat Litter, 40 pound bag ( "
'Pack May Vary ) ',
'RANKING': '#1',
'REVIEWS': '13,612'}
2019-01-21 21:13:07 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.amazon.com/dp/B01N9KSITZ> (referer: https://www.amazon.com/s?i=pets&bbn=2619533011&rh=n%3A2619533011%2Cp_n_availability%3A2661601011&lo=grid&page=2&ie=UTF8&qid=1548097190&ref=sr_pg_1)
2019-01-21 21:13:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.amazon.com/dp/B01N9KSITZ>
{}
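Without seeing AmazonDataLoader it's hard to be definitive, but a typical definition would look like the sketch below (the processors are assumptions; on older Scrapy versions the same processors were importable from scrapy.loader.processors). Note that with a TakeFirst() output processor, any field whose XPath matches nothing is simply omitted from the item, so a page served with a different layout, or a stripped bot-detection page, legitimately produces {}:

from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst


class AmazonDataLoader(ItemLoader):
    # strip whitespace from every extracted string, keep the first non-empty value
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

Logging response.status and a snippet of response.text for the pages that come back empty would show whether Amazon is serving those requests a different page entirely.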

Scrapy links crawled but not scraped

I have made a scraper to crawl all the categories related to "au-quotidien" on the e-commerce website Cdiscount.
The bot is supposed to start on the top menu, then follow links one layer deep, then a second and a third, and scrape the items there. Here is my code, as a test:
import re

import scrapy


class CdiscountSpider(scrapy.Spider):
    name = "cdis_bot"  # how we have to call the bot
    start_urls = ["https://www.cdiscount.com/au-quotidien/v-127-0.html"]

    def parse(self, response):
        for link in response.css('div.mvNavSub ul li a::attr(href)').extract():
            regex_top_category = r"\b(?=\w)" + re.escape("au-quotidien") + r"\b(?!\w)"
            if re.search(regex_top_category, link):
                # going one layer deep from the landing page
                yield response.follow(link, callback=self.parse_on_categories)

    def parse_on_categories(self, response):
        for link in response.css('div.mvNavSub ul li a::attr(href)').extract():
            # going two layers deep from the landing page
            yield response.follow(link, callback=self.parse_on_subcategories)

    def parse_on_subcategories(self, response):
        for link in response.css('div.mvNavSub ul li a::attr(href)').extract():
            # going three layers deep from the landing page
            yield response.follow(link, callback=self.parse_data)

    def parse_data(self, response):
        links_list = response.css("div.prdtBILDetails a::attr(href)").extract()
        regex_ean = re.compile(r'(\d+)\.html')
        eans_list = [regex_ean.search(link).group(1) for link in links_list if regex_ean.search(link)]
        desc_list = response.css("div.prdtBILTit::text").extract()
        price_euros = response.css("span.price::text").extract()
        price_cents = response.css("span.price sup::text").extract()
        for euro, cent, ean, desc in zip(price_euros, price_cents, eans_list, desc_list):
            if len(ean) > 6:
                yield {'ean': ean, 'price': euro + cent, 'desc': desc, 'company': "cdiscount", 'url': response.url}
My problem is that only links are retrieved.
For instance:
2018-12-18 14:40:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cdiscount.com/au-quotidien/alimentaire/pates-riz-/legumes-secs/l-127015303.html> (referer: https://www.cdiscount.com/au-quotidien/alimentaire/pates-riz-/l-1270153.html)
2018-12-18 14:40:41 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.cdiscount.com/au-quotidien/alimentaire/pates-riz-/semoules/l-127015302.html> (referer: https://www.cdiscount.com/au-quotidien/alimentaire/pates-riz-/l-1270153.html)
But I get only very few scraped items, always from the same category, like this:
{'ean': '2009818241269', 'price': '96€00', 'desc': 'Heidsieck & Co Monopole 75cl x6', 'company': 'cdiscount', 'url': 'https://www.cdiscount.com/vin-champagne/vin-champagne/champagne-brut/l-1293402.html'}
2018-12-18 14:40:34 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.cdiscount.com/vin-champagne/vin-champagne/champagne-brut/l-1293402.html>
Yet it seems to me that the other categories share the same item selectors.
If you could help me figure out where I am going wrong I would be grateful :) Thank you.
It looks like the responses your parse_data() method is receiving are all vastly different.
For example, these are the first three urls it parses on a sample run:
https://www.cdiscount.com/vin-champagne/vin-champagne/champagne-millesime/l-1293404.html
https://www.cdiscount.com/vin-champagne/coffrets-cadeaux/v-12960-12960.html
https://www.cdiscount.com/au-quotidien/alimentaire/bio/boisson-bio/jus-de-tomates-bio/l-12701271315.html
It's obvious (even from a quick glance) that the structure of each of these pages is different.
In most cases, your eans_list and desc_list are empty, so the zip() call produces no results.
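One way to make parse_data() robust to that variation is to iterate per product block instead of zipping parallel page-wide lists, so a missing field on one listing cannot shift all the others. A sketch reusing the asker's selectors; whether the title and price actually sit inside each div.prdtBILDetails block is an assumption to verify:

def parse_data(self, response):
    regex_ean = re.compile(r'(\d+)\.html')
    # one selector object per product, so the fields stay aligned
    for product in response.css("div.prdtBILDetails"):
        link = product.css("a::attr(href)").extract_first(default="")
        match = regex_ean.search(link)
        if not match or len(match.group(1)) <= 6:
            continue  # no usable EAN in this block
        euros = product.css("span.price::text").extract_first(default="")
        cents = product.css("span.price sup::text").extract_first(default="")
        yield {
            'ean': match.group(1),
            'price': euros + cents,
            'desc': product.css("div.prdtBILTit::text").extract_first(),
            'company': "cdiscount",
            'url': response.url,
        }

Pages that contain no div.prdtBILDetails blocks at all will then yield nothing, which makes the "crawled but not scraped" categories easy to spot in the logs.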

Python Scrapy for grabbing table columns and rows

I'm relatively a noob at Python and it's my first time learning Scrapy. I've done data mining with Perl quite successfully before, but this is a whole different ballgame!
I'm trying to scrape a table and grab the columns of each row. My code is below.
items.py
from scrapy.item import Item, Field


class Cio100Item(Item):
    company = Field()
    person = Field()
    industry = Field()
    url = Field()
scrape.py (the spider)
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from cio100.items import Cio100Item

items = []


class MySpider(BaseSpider):
    name = "scrape"
    allowed_domains = ["cio.co.uk"]
    start_urls = ["http://www.cio.co.uk/cio100/2013/cio/"]

    def parse(self, response):
        sel = Selector(response)
        tables = sel.xpath('//table[@class="bgWhite listTable"]//h2')
        for table in tables:
            # print table
            item = Cio100Item()
            item['company'] = table.xpath('a/text()').extract()
            item['person'] = table.xpath('a/text()').extract()
            item['industry'] = table.xpath('a/text()').extract()
            item['url'] = table.xpath('a/@href').extract()
            items.append(item)
        return items
I'm having some trouble understanding how to articulate the XPath selection correctly.
I think this line is the problem:
tables = sel.xpath('//table[@class="bgWhite listTable"]//h2')
When I run the scraper as-is above, I get things like this in the terminal:
2014-01-13 22:13:29-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u"\nDomino's Pizza\n"],
'industry': [u"\nDomino's Pizza\n"],
'person': [u"\nDomino's Pizza\n"],
'url': [u'/cio100/2013/dominos-pizza/']}
2014-01-13 22:13:29-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u'\nColin Rees\n'],
'industry': [u'\nColin Rees\n'],
'person': [u'\nColin Rees\n'],
'url': [u'/cio100/2013/dominos-pizza/']}
Ideally I want only one block, not two, with Domino's in the company slot, Colin in the person slot, and the industry grabbed as well, which it's not doing.
When I use Firebug to inspect the table, I see h2 for columns 1 and 2 (company and person), but column 3 is an h3.
When I modify the tables line to use h3 at the end, as follows:
tables = sel.xpath('//table[@class="bgWhite listTable"]//h3')
I get this:
2014-01-13 22:16:46-0500 [scrape] DEBUG: Scraped from <200 http://www.cio.co.uk/cio100/2013/cio/>
{'company': [u'\nRetail\n'],
'industry': [u'\nRetail\n'],
'person': [u'\nRetail\n'],
'url': [u'/cio100/2013/dominos-pizza/']}
Here it produces only one block, and it's capturing the industry and the URL correctly, but it's not getting the company name or person.
Any help will be greatly appreciated!
Thanks!
As far as the XPath goes, consider doing something like:
$ scrapy shell http://www.cio.co.uk/cio100/2013/cio/
...
>>> for tr in sel.xpath('//table[@class="bgWhite listTable"]/tr'):
...     item = Cio100Item()
...     item['company'] = tr.xpath('td[2]//a/text()').extract()[0].strip()
...     item['person'] = tr.xpath('td[3]//a/text()').extract()[0].strip()
...     item['industry'] = tr.xpath('td[4]//a/text()').extract()[0].strip()
...     item['url'] = tr.xpath('td[4]//a/@href').extract()[0].strip()
...     print item
...
{'company': u'LOCOG',
'industry': u'Leisure and entertainment',
'person': u'Gerry Pennell',
'url': u'/cio100/2013/locog/'}
{'company': u'Laterooms.com',
'industry': u'Leisure and entertainment',
'person': u'Adam Gerrard',
'url': u'/cio100/2013/lateroomscom/'}
{'company': u'Vodafone',
'industry': u'Communications and IT services',
'person': u'Albert Hitchcock',
'url': u'/cio100/2013/vodafone/'}
...
Other than that, you'd better yield the items one by one rather than accumulating them in a list.
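Folded back into the spider, that advice makes parse() a generator. A sketch keeping the answer's column indexes (note the [0] indexing will raise IndexError on any row where a cell has no link, such as a header row, so real code may need a guard):

class MySpider(BaseSpider):
    name = "scrape"
    allowed_domains = ["cio.co.uk"]
    start_urls = ["http://www.cio.co.uk/cio100/2013/cio/"]

    def parse(self, response):
        sel = Selector(response)
        # yield one item per table row instead of returning a module-level list
        for tr in sel.xpath('//table[@class="bgWhite listTable"]/tr'):
            item = Cio100Item()
            item['company'] = tr.xpath('td[2]//a/text()').extract()[0].strip()
            item['person'] = tr.xpath('td[3]//a/text()').extract()[0].strip()
            item['industry'] = tr.xpath('td[4]//a/text()').extract()[0].strip()
            item['url'] = tr.xpath('td[4]//a/@href').extract()[0].strip()
            yield item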
