XPath selection only returns the first result - Python

I'm still new to Scrapy. When trying to read data from quotes.toscrape.com, I don't get the expected content back when using XPath selectors; as soon as I use CSS selectors, everything works as intended. I just can't find the error, even though the example is super simple.
quotes.py
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item

        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
items.py
import scrapy
from scrapy.loader import ItemLoader


class QuotesLoaderItem(scrapy.Item):
    # define the fields for your item here like:
    author_name = scrapy.Field()
    quote_text = scrapy.Field()
    author_link = scrapy.Field()
    tags = scrapy.Field()
Result
author_name,quote_text,author_link,tags
Albert Einstein,“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”,/author/Albert-Einstein,change
Albert Einstein, ...
...
(20 times)
Thank you for your help.

Inside the loop I am working with a Selector object rather than a Response object, so the XPath expressions have to be relative (start with a dot). The working version looks like this:
import scrapy
from quotes_loader.items import QuotesLoaderItem as QL


class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = [
        'http://quotes.toscrape.com//']

    def parse(self, response):
        item = QL()
        quotes = response.xpath('//div[@class="quote"]')
        for quote in quotes:
            # CSS selector
            # item['author_name'] = quote.css('small.author::text').get()
            # item['quote_text'] = quote.css('span.text::text').get()
            # item['author_link'] = quote.css('small.author + a::attr(href)').get()
            # item['tags'] = quote.css('div.tags > a.tag::text').get()
            # XPath selector
            item['author_name'] = quote.xpath('.//small[@class="author"]/text()').get()
            item['quote_text'] = quote.xpath('.//span[@class="text"]/text()').get()
            item['author_link'] = quote.xpath('.//small[@class="author"]/following-sibling::a/@href').get()
            item['tags'] = quote.xpath('.//*[@class="tags"]/*[@class="tag"]/text()').get()
            yield item

        # next_page_url = response.css('li.next > a::attr(href)').get()
        next_page_url = response.xpath('//*[@class="next"]/a/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield scrapy.Request(absolute_next_page_url)
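
For context, the reason the leading dot matters: inside the loop, quote is a Selector for a single div.quote, and an XPath that starts with // searches the whole document again from the root, so every iteration returns the first match on the page. A minimal sketch of the difference, using the standard quotes.toscrape.com markup:

quotes = response.xpath('//div[@class="quote"]')
first_quote = quotes[0]

# Absolute XPath: starts again from the document root, so inside a loop
# it always matches the first author on the whole page.
first_quote.xpath('//small[@class="author"]/text()').get()

# Relative XPath (leading dot): searches only within this quote's <div>.
first_quote.xpath('.//small[@class="author"]/text()').get()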

Related

Scrapy file, only running the initial start_urls instead of running through the whole list

As the title states, I am trying to run my Scrapy program, but it seems to only return the yielded items from the initial URL (https://www.antaira.com/products/10-100Mbps).
I am unsure where my program is going wrong; I have also left some commented-out code showing what I have attempted.
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    start_urls = [
        'https://www.antaira.com/products/10-100Mbps',
        'https://www.antaira.com/products/unmanaged-gigabit'
        'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE'
        'https://www.antaira.com/products/Unmanaged-Gigabit-PoE'
        'https://www.antaira.com/products/Unmanaged-10-gigabit'
        'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE'
    ]

    #def start_requests(self):
    #    yield scrappy.Request(start_urls, self.parse)

    def parse(self, response):
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()  # Unique item for each iteration
            items['product_link'] = response.url  # get the product link from response
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
Thank you everyone!
Follow-up question: for some reason, when I run "scrapy crawl productJumperFix" I'm not getting any output in the terminal, and I'm not sure how to debug since I can't even see the error output.
Try using the start_requests method instead. Note that in your start_urls list most entries are missing a trailing comma, so Python concatenates the adjacent string literals into one long, invalid URL, which is why only the first page is crawled.
For example:
import scrapy
from ..items import AntairaItem


class ProductJumperFix(scrapy.Spider):
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)  # use variable
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        for product in response.css('main.products'):
            items = AntairaItem()
            items['product_link'] = response.url
            name = product.css('h1.product-name::text').get().strip()
            features = product.css(('section.features h3 + ul').strip()).getall()
            overview = product.css('.products .product-overview::text').getall()
            main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
            rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
            items['name'] = name,
            items['features'] = features,
            items['overview'] = overview,
            items['main_image'] = main_image,
            items['rel_links'] = rel_links,
            yield items
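
As a side note on why the original start_urls only produced one working request: Python implicitly concatenates adjacent string literals. A minimal illustration:

# Adjacent string literals with no comma between them are joined,
# so this list contains ONE malformed URL instead of two:
urls = [
    'https://www.antaira.com/products/10-100Mbps'
    'https://www.antaira.com/products/unmanaged-gigabit'
]
print(len(urls))  # 1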

How to scrape data up to n levels using Scrapy

I am new to Scrapy and this is my first try at web scraping. The structure of the website I am trying to scrape is the following:
level 0: main company URL ---> level 1: several associated company URLs ---> level 2: each associated company URL in level 1 links to many more URLs ---> ... up to level n
Right now I can scrape data up to level 1, but I want to do it recursively up to the n-th level, with a control like max_depth for how deep to go. I cannot figure out how to do it.
Here is the spider I have written so far:
import scrapy
from ..items import *


class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    base_url = 'https://www.zaubacorp.com/'
    custom_settings = {
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def parse(self, response):
        search_links = response.xpath('//table[@id="results"]/tr/td/a[contains(@href,"company/DOIT-URBAN")]/@href').getall()
        page_list = search_links[1:]
        #url = search_links.pop(0)
        check_list = []
        for url in search_links:
            print("func 1")
            yield response.follow(url=url, callback=self.parse_doit,
                                  meta={'page_list': page_list,
                                        'check_list': check_list})

    def parse_doit(self, response):
        print("func 2")
        check_list = response.meta['check_list']
        lnk = MainLink()
        lnk['name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        lnk['url'] = response.url
        lnk['address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        lnk['email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        lnk['director1'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        lnk['director2'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        dir1_same_co_list = response.xpath('//*[@id="accordion1"]/table[1]//td//p/a/@href').getall()
        dir2_same_co_list = response.xpath('//*[@id="accordion2"]/table[1]//td//p/a/@href').getall()
        co_list = dir1_same_co_list + list(set(dir2_same_co_list) - set(dir1_same_co_list))
        dir_same_co_list = list(set(co_list) - set(check_list))
        check_list = check_list + list(set(dir_same_co_list) - set(check_list))
        page_list = response.meta['page_list']
        if dir1_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})

    def parse_level_2(self, response):
        print("func 3")
        lnk = response.meta['name']
        lnk = response.meta['url']
        lnk = response.meta['address']
        lnk = response.meta['email']
        lnk = response.meta['director1']
        lnk = response.meta['director2']
        page_list = response.meta['page_list']
        #next_page = response.meta['next_page']
        level_2 = SecondaryLink()
        try:
            lnk['Company_Details_W_Same_Directors']
        except:
            lnk['Company_Details_W_Same_Directors'] = []
        #for sub_link in dir1_same_co_list:
        level_2['Co_Name'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[1]/div[1]/table/tbody/tr[1]/td[2]/p//text()').get()
        level_2['Co_url'] = response.url
        level_2['Address'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[4]/text()').get()
        level_2['Email'] = response.xpath('//*[@id="block-system-main"]/div[2]/div[1]/div[6]//p[1]/text()').get()
        level_2['First_Director'] = response.xpath('//*[@id="package1"]/td[2]//p//text()').get()
        level_2['Second_Director'] = response.xpath('//*[@id="package2"]/td[2]//p//text()').get()
        lnk['Company_Details_W_Same_Directors'].append(level_2)
        dir_same_co_list = response.meta['dir_same_co_list']
        print("===== start reading co list =====")
        if dir_same_co_list:
            next_url_dir = dir_same_co_list.pop(0)
            print("co list", len(dir_same_co_list))
            yield response.follow(url=next_url_dir, callback=self.parse_level_2,
                                  meta={'name': lnk,
                                        'url': lnk,
                                        'address': lnk,
                                        'email': lnk,
                                        'director1': lnk,
                                        'director2': lnk,
                                        'dir_same_co_list': dir_same_co_list,
                                        'page_list': page_list})
        else:
            if page_list:
                print("next page loop")
                next_page = page_list.pop(0)
                next_page_url = next_page
                yield response.follow(url=next_page_url, callback=self.parse_doit,
                                      meta={'name': lnk,
                                            'url': lnk,
                                            'address': lnk,
                                            'email': lnk,
                                            'director1': lnk,
                                            'director2': lnk,
                                            'next_page': next_page,
                                            'page_list': page_list})
            else:
                yield lnk
and items.py is the following:
class MainLink(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    address = scrapy.Field()
    email = scrapy.Field()
    director1 = scrapy.Field()
    Company_Details_W_Same_Directors = scrapy.Field()
    director2 = scrapy.Field()
    pass


class SecondaryLink(scrapy.Item):
    Co_Name = scrapy.Field()
    Co_url = scrapy.Field()
    Address = scrapy.Field()
    Email = scrapy.Field()
    First_Director = scrapy.Field()
    Second_Director = scrapy.Field()
    pass
Help is much appreciated
You can make use of the DEPTH_LIMIT setting in Scrapy. Please see https://docs.scrapy.org/en/latest/topics/settings.html#depth-limit
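For illustration, a minimal sketch of how this could look for the spider above, keeping only the depth-related parts (the limit of 3 is just an example value); Scrapy's DepthMiddleware also exposes the current depth via response.meta:

import scrapy


class NodeSpider(scrapy.Spider):
    name = 'nodes'
    start_urls = ['https://www.zaubacorp.com/companysearchresults/DOIT-']
    custom_settings = {
        'DEPTH_LIMIT': 3,  # requests more than 3 links away from the start URL are dropped
    }

    def parse(self, response):
        # DepthMiddleware records how deep the current response is
        current_depth = response.meta.get('depth', 0)
        self.logger.info('parsing %s at depth %d', response.url, current_depth)
        for href in response.xpath('//table[@id="results"]/tr/td/a/@href').getall():
            yield response.follow(href, callback=self.parse)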

Scrapy not able to scrape the next page

I wanted to scrape the information from the following pages; however, the code only allows me to scrape the information from the first page.
My code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from ..items import PropertyItem


class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.com']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        item = PropertyItem()
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item
            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
It's all down to your allowed_domains (though you need to fix your indentation too): the site is starproperty.my, not starproperty.com, so the next-page requests get filtered as offsite. Also, I'm sure you want to create your item inside your loop:
class Starprop(scrapy.Spider):
    name = 'starprop'
    allowed_domains = ['starproperty.my']
    start_urls = ['https://www.starproperty.my/to-buy/search?max_price=1000000%2B&new_launch_checkbox=on&sub_sales_checkbox=on&auction_checkbox=on&listing=For%20Sale&sort=latest&page=1']

    def parse(self, response):
        property_list = response.css('.mb-4 div')
        for property in property_list:
            property_name = property.css('.property__name::text').extract()
            property_price = property.css('.property__price::text').extract()
            property_location = property.css('.property__location::text').extract()
            property_agent = property.css('.property__agentdetails .property__agentdetails span:nth-child(1)::text').extract()
            property_phone = property.css('.property__agentcontacts a span::text').extract()
            item = PropertyItem()
            item['property_name'] = property_name
            item['property_price'] = property_price
            item['property_location'] = property_location
            item['property_agent'] = property_agent
            item['property_phone'] = property_phone
            yield item
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
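To make the allowed_domains point concrete: Scrapy's offsite filtering drops any followed request whose host is not covered by allowed_domains, so the crucial one-line difference is:

allowed_domains = ['starproperty.com']  # does not match www.starproperty.my -> next-page requests are filtered
allowed_domains = ['starproperty.my']   # matches the site being crawled -> pagination works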
Maybe it's due to indentation?
Try changing:

            yield item
            next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)

to:

            yield item
        next_page = response.css('.page-item:nth-child(10) .page-link::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

i.e. move the pagination handling out of the for loop so the next page is requested only once.

Get the Type category for the URLs using scrapy

For this URL, I need all the product URLs and their respective type.
So the output should be:
Product_URL1 Blouse
Product_URL2 Crop Top
Product_URL3 Tank Top
Product_URL4 Strappy Top
Product_URL5 Tube Top
Below is my code; I guess everything is right except the XPath for item['type'].
from scrapy.spiders import CrawlSpider
import scrapy
from scrapy.http.request import Request


class JabongItem(scrapy.Item):
    base_link = scrapy.Field()
    type = scrapy.Field()
    count = scrapy.Field()
    product_name = scrapy.Field()
    product_link = scrapy.Field()


class JabongScrape(CrawlSpider):
    name = "jabong"
    allowed_domains = "jabong.com"
    start_urls = ["http://www.jabong.com/women/clothing/tops-tees-shirts/tops", "http://www.jabong.com/women/clothing/tops-tees-shirts/tees"]

    def parse(self, response):
        item = JabongItem()
        try:
            for idx in range(0, 20):
                item['type'] = response.xpath("//div[contains(@class, 'options')]/label/a/text()").extract()[idx]
                item['base_link'] = response.url + response.xpath("//div[contains(@class, 'options')]/label/a/@href").extract()[idx] + "?ax=1&page=1&limit=" + (response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx]).replace("[", "").replace("]", "") + "&sortField=popularity&sortBy=desc"
                item['count'] = (response.xpath("//div[contains(@class, 'options')]/label/small/text()").extract()[idx]).replace("[", "").replace("]", "")
                yield Request(item['base_link'], callback=self.parse_product_link,
                              meta={'item': item, 'count': int(item['count'])}, dont_filter=True)
        except:
            pass

    def parse_product_link(self, response):
        item = response.meta['item']
        try:
            for i in range(0, response.meta['count']):
                item['product_link'] = response.xpath("//div[contains(@class, 'col-xxs-6 col-xs-4 col-sm-4 col-md-3 col-lg-3 product-tile img-responsive')]/a/@href").extract()[i]
                # item['original_price'] = response.xpath("section.row > div:nth-child(1) > a:nth-child(1) > div:nth-child(2) > div:nth-child(2) > span:nth-child(1) > span:nth-child(1)::text").extract()[idx]
                print i
                yield item
        except:
            pass
And the jbng_base_links.txt contains "http://www.jabong.com/women/clothing/tops-tees-shirts/tops"
As Rafael pointed out, the easiest way of doing this is simply restructuring your spider manually to follow this order:
1. Go to the webpage
2. Find the type URLs
3. Go to every type URL -> scrape the items
It could be as simple as:
class MySpider(scrapy.Spider):
    name = 'myspider'
    start_urls = []

    def parse(self, response):
        """this will parse the landing page for type urls"""
        urls = response.xpath("//div[contains(text(),'Type')]/..//a/@href").extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, self.parse_type)

    def parse_type(self, response):
        """this will parse every type page for items"""
        type_name = response.xpath("//a[@class='filtered-brand']/text()").extract_first()
        product_urls = ...
        for url in product_urls:
            yield {'type': type_name, 'url': url}
        # handle next page
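
The "# handle next page" part is left open above; a possible way to finish it could look like the following sketch (the 'a.next-page' selector is a placeholder assumption, not taken from the actual site):

        next_page = response.css('a.next-page::attr(href)').extract_first()  # hypothetical selector
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), self.parse_type)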

Scrapy Webcrawler and Data Extractor

I am trying to create a web crawler with Scrapy. I am using a template that I have used before, but I can't seem to get it to parse the URLs. I can see it go to YouTube and then go to the watch page, but from there it won't pull the title or descriptions or anything, because it always fails to parse.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
from krakenkrawler.items import KrakenItem


class AttractionSpider(CrawlSpider):
    name = "thekraken"
    allowed_domains = ["youtube.com"]
    start_urls = [
        "http://www.youtube.com/?gl=GB&hl=en-GB"
    ]
    rules = ()

    def __init__(self, name=None, **kwargs):
        super(AttractionSpider, self).__init__(name, **kwargs)
        self.items_buffer = {}
        self.base_url = "http://www.youtube.com"
        from scrapy.conf import settings
        settings.overrides['DOWNLOAD_TIMEOUT'] = 360

    def parse(self, response):
        print "Start scrapping Attractions...."
        try:
            hxs = HtmlXPathSelector(response)
            links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
            if not links:
                return
                log.msg("No Data to scrap")
            for link in links:
                v_url = ''.join(link.extract())
                if not v_url:
                    continue
                else:
                    _url = self.base_url + v_url
                    yield Request(url=_url, callback=self.parse_details)
        except Exception as e:
            log.msg("Parsing failed for URL {%s}" % format(response.request.url))
            raise

    def parse_details(self, response):
        print "Start scrapping Detailed Info...."
        try:
            hxs = HtmlXPathSelector(response)
            l_venue = KrakenItem()
            v_name = hxs.select("//*[@id='eow-title'].text").extract()
            if not v_name:
                v_name = hxs.select("//*[@id='eow-title'].text").extract()
            l_venue["name"] = v_name[0].strip()
            base = hxs.select("//*[@id='content']/div[7]")
            if base.extract()[0].strip() == "<div style=\"clear:both\"></div>":
                base = hxs.select("//*[@id='content']/div[8]")
            elif base.extract()[0].strip() == "<div style=\"padding-top:10px;margin-top:10px;border-top:1px dotted #DDD;\">\n You must be logged in to add a tip\n </div>":
                base = hxs.select("//*[@id='content']/div[6]")
            x_datas = base.select("div[1]/b").extract()
            v_datas = base.select("div[1]/text()").extract()
            i_d = 0;
            if x_datas:
                for x_data in x_datas:
                    print "data is:" + x_data.strip()
                    if x_data.strip() == "<b>Address:</b>":
                        l_venue["address"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Contact:</b>":
                        l_venue["contact"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Operating Hours:</b>":
                        l_venue["hours"] = v_datas[i_d].strip()
                    if x_data.strip() == "<b>Website:</b>":
                        l_venue["website"] = (base.select("//*[@id='watch-actions-share-panel']/div/div[2]/div[2]/span[1]/input/text()").extract())[0].strip()
                    i_d += 1
            v_photo = base.select("img/@src").extract()
            if v_photo:
                l_venue["photo"] = v_photo[0].strip()
            v_desc = base.select("div[3]/text()").extract()
            if v_desc:
                desc = ""
                for dsc in v_desc:
                    desc += dsc
                l_venue["desc"] = desc.strip()
            v_video = hxs.select("//*[@id='content']/iframe/@src").extract()
            if v_video:
                l_venue["video"] = v_video[0].strip()
            yield l_venue
        except Exception as e:
            log.msg("Parsing failed for URL {%s}" % format(response.request.url))
            raise
Thanks a ton in advance.
The problem is that the structure you are looking for, "//h3[@class='yt-lockup-title']//a/@href", is not present in all pages.
I modified your code to verify what pages are opened and what data are extracted:
class AttractionSpider(CrawlSpider):
    name = "thekraken"
    bot_name = 'kraken'
    allowed_domains = ["youtube.com"]
    start_urls = ["http://www.youtube.com/?gl=GB&hl=en-GB"]
    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        print "Start scrapping Attractions...."
        print response.url
        try:
            hxs = HtmlXPathSelector(response)
            links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
            for link in links:
                v_url = ''.join(link.extract())
                print v_url
            if not links:
                log.msg("No Data to scrap")
        except:
            pass
Result is something like this:
Start scrapping Attractions....http://www.youtube.com/watch?v=GBdCbciGLK0
Start scrapping Attractions....http://www.youtube.com/watch?v=BxUjDpnSHyc&list=TL4PEfm95Wz3k
Start scrapping Attractions.... http://www.youtube.com/watch?v=T-CZW4YjAig
Start scrapping Attractions....
https://www.youtube.com/user/ComedyShortsGamer
/watch?v=TdICODRvAhc&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=CDGzm5edrlw&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F2oR5KS54JM&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=LHRzOIvqmQI&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=F4iqiM6h-2U&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=ug3UPIvWlvU&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=msiZs6lIZ9w&list=UUrqsNpKuDQZreGaxBL_a5Jg
/watch?v=Jh6A3DoOLBg&list=UUrqsNpKuDQZreGaxBL_a5Jg
On the inner pages where no results are scraped, there are no "yt-lockup-title" elements.
In brief, you have to improve your spider.
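
One way to "improve the spider" along these lines, as a sketch only (it keeps the old Scrapy/Python 2 APIs used in the post, and the watch-page regex is an assumption): restrict the link extractor to video watch pages and bail out early when the expected markup is missing. Inside the spider class above, that could look like:

    rules = (
        # follow only links that look like video watch pages, where
        # the 'yt-lockup-title' markup is expected to exist
        Rule(SgmlLinkExtractor(allow=(r'/watch\?v=',)), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//h3[@class='yt-lockup-title']//a/@href")
        if not links:
            # nothing to scrape on this page, skip it instead of failing
            return
        for link in links:
            print ''.join(link.extract())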
