XPath not found using Scrapy - Python

I want to extract the email and phone number, but I could not work out the XPath for them; I was only able to get the XPath for the website link. This is the page I am extracting the data from: https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link
        }

Now it's working:
from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link,
            'phone': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]//text()[2])').get(),
            'email': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get()
        }
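For reference, the phone and email expressions work by selecting a specific text node and collapsing its whitespace with normalize-space(). A minimal, self-contained sketch of that pattern (the HTML fragment below is made up for illustration, not taken from the real page):

from scrapy import Selector

# Hypothetical fragment mimicking the structure targeted above.
html = '''
<div class="s-mrgb-05 s-mrgt-05">
  <div>Email: <br> info@example.com </div>
</div>
'''

sel = Selector(text=html)

# //text()[2] keeps text nodes that are the second text child of their parent;
# normalize-space() trims and collapses the surrounding whitespace.
print(sel.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get())
# -> 'info@example.com'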

Related

Python Scrapy returns nothing from the website

I want to crawl the user review section of this page: https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3, but it returns an empty result even though the selector path looks correct.
import scrapy
from scrapy import Selector, Request

class LaptopSpider(scrapy.Spider):
    name = 'cs'

    def start_requests(self):
        url = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
        yield Request(url, callback=self.parse)

    def parse(self, response):
        products_selector = response.css('#productRecap > div.p--z > div:nth-child(3) > div > div > div > div > div.Oi-z > div::text').get()
        print(products_selector)
The review section is rendered in the browser, so it is not in the HTML that Scrapy downloads; the page fetches the reviews from an API instead. Try this to get the reviews from the link in your post:
import scrapy

class ZapposSpider(scrapy.Spider):
    name = 'zappos'
    link = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
    base_url = 'https://api.prod.cassiopeia.ugc.zappos.com/display/v2/reviews'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    params = {
        'offset': '0',
        'page': '1',
        'productId': '',
        'sort': 'upVotes:desc,overallRating:desc,reviewDate:desc'
    }

    def start_requests(self):
        product_id = self.link.split("product/")[1].split("/")[0]
        self.params['productId'] = product_id
        yield scrapy.FormRequest(
            url=self.base_url,
            headers=self.headers,
            callback=self.parse,
            method="GET",
            formdata=self.params,
        )

    def parse(self, response):
        for item in response.json()['reviews']:
            reviewer = item['name']
            review = item['summary']
            yield {"reviewer": reviewer, "review": review}

Remove unnecessary URLs in Scrapy

import scrapy
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            print(url)
I want to remove these unnecessary URLs from the extracted links. The website is https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx:
http://www.unbr.ro
http://www.inppa.ro
http://www.uniuneanotarilor.ro/
http://www.caav.ro
http://www.executori.ro/
http://www.csm1909.ro
http://www.inm-lex.ro
http://www.just.ro
You can use the endswith method together with the continue keyword to skip the unwanted URLs:
import scrapy
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            # skip the external portal links, which all end in '.ro' or '.ro/'
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            print(url)
Output:
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159077&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159076&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159075&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159021&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159020&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159019&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159018&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=21846&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=165927&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=83465&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=47724&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=32097&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=29573&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=19880&Signature=378270
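A more general alternative (a sketch, not part of the original answer) is to compare each link's host against the site's own domain with urllib.parse, which also drops external links that do not happen to end in '.ro':

from urllib.parse import urlparse

allowed_host = 'www.ifep.ro'

def is_internal(url):
    # keep only links that point back to the crawled site
    return urlparse(url).netloc == allowed_host

links = [
    'http://www.unbr.ro',
    'https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270',
]
print([u for u in links if is_internal(u)])
# -> ['https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270']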

Scrapy and Selenium together for dynamic pages

I have a problem with going to the next page: the spider moves to the next page but then returns to the first page again, so it only gives the data of page 1. I have tried different approaches but have not been able to solve this. This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath("//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
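One way to actually capture each page's data inside that loop (a sketch, not part of the original question; it assumes the pager is driven by JavaScript postbacks and that a chromedriver is available) is to hand the HTML Selenium has rendered back to a Scrapy Selector after every click:

import time
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get('https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx')

while True:
    # parse the HTML that Selenium has actually rendered for this page
    sel = scrapy.Selector(text=driver.page_source)
    for href in sel.xpath("//div[@class='list-group']//@href").getall():
        print(href)
    try:
        driver.find_element(By.XPATH, "//a[@id='MainContent_PagerTop_NavNext']").click()
    except NoSuchElementException:
        break
    time.sleep(1)  # crude wait for the next page to load

driver.quit()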

Want to remove some text from the line

I need only the address, not the Tel, Fax and Email. When I run the code it gives me the whole block, but I want only the address. This is the page link: https://all.accor.com/hotel/8392/index.de.shtml
from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://all.accor.com/de/region/hotels-sachsen-dsn.shtml']
    page_number = 0
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//a[@class='Teaser-link']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath("//h3//text()").get()
        address = response.xpath("//div[@class='infos__content']//p//text()")[:-3].getall()
        address = [i.strip() for i in address]
        # remove empty strings:
        address = [i for i in address if i]
        yield {
            'name': title,
            'address': address,
        }
Your XPath selector for the address is wrong. You need to restrict the text to the first child div of the div with class infos__content. Use the code below for the parse_book method and it should work.
def parse_book(self, response):
    title = response.xpath("//h3//text()").get()
    address = response.xpath("normalize-space(//div[@class='infos__content']/div[1]/p)").get()
    # replace non-breaking spaces with regular spaces
    address = address.replace("\xa0", " ")
    yield {
        'name': title,
        'address': address,
    }

Trouble outputting data with Scrapy

I am attempting to extract info about articles from this site. I am a Scrapy newbie and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs printed. I am unable to figure out what I am missing or need to change. Any help would be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but at
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
And you probably have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but at
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but at
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
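A quick way to verify selectors like these (a sketch; it assumes the article markup still carries these attributes) is Scrapy's interactive shell against any article URL:

# In a terminal (replace the placeholder with a real article URL):
#   scrapy shell "https://www.artofmanliness.com/<article-url>/"
# Then, at the shell prompt:
>>> response.xpath('//h1[@itemprop="headline"]/text()').get()
>>> response.xpath('//p[@class="in-category"]//a/text()').getall()
>>> response.xpath('//span[@itemprop="datePublished"]/text()').get()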
Minimal working code with CrawlerProcess(): you can paste it all into one file, script.py, and run it as python script.py without creating a project. I use max_pages = 2 to test only a few articles.
import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
