XPath not found using Scrapy - Python

I want to extract the email and phone number, but I could not work out the XPath for them; I was only able to get the XPath for the website link. This is the page I am extracting the data from: https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link
        }

Now it's working:
from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//td[@class='views-field views-field-title']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        link = response.xpath("//a[@class='field__item link link--external']//@href").get()
        yield {
            'website': link,
            'phone': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]//text()[2])').get(),
            'email': response.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get()
        }
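For reference, the phone and email expressions work by selecting a specific text node and collapsing its whitespace with normalize-space(). A minimal, self-contained sketch of that pattern (the HTML fragment below is made up for illustration, not taken from the real page):

from scrapy import Selector

# Hypothetical fragment mimicking the structure targeted above.
html = '''
<div class="s-mrgb-05 s-mrgt-05">
  <div>Email: <br> info@example.com </div>
</div>
'''

sel = Selector(text=html)

# //text()[2] keeps text nodes that are the second text child of their parent;
# normalize-space() trims and collapses the surrounding whitespace.
print(sel.xpath('normalize-space(//*[@class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get())
# -> 'info@example.com'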

Related

Python Scrapy returns nothing from the website

I want to crawl the user review section of this page: https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3, but it returns an empty result even though the selector path looks correct.
import scrapy
from scrapy import Selector, Request

class LaptopSpider(scrapy.Spider):
    name = 'cs'

    def start_requests(self):
        url = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
        yield Request(url, callback=self.parse)

    def parse(self, response):
        products_selector = response.css('#productRecap > div.p--z > div:nth-child(3) > div > div > div > div > div.Oi-z > div::text').get()
        print(products_selector)
The review section is rendered in the browser, so it is not in the HTML that Scrapy downloads; the page fetches the reviews from an API instead. Try this to get the reviews from the link in your post:
import scrapy

class ZapposSpider(scrapy.Spider):
    name = 'zappos'
    link = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
    base_url = 'https://api.prod.cassiopeia.ugc.zappos.com/display/v2/reviews'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    params = {
        'offset': '0',
        'page': '1',
        'productId': '',
        'sort': 'upVotes:desc,overallRating:desc,reviewDate:desc'
    }

    def start_requests(self):
        product_id = self.link.split("product/")[1].split("/")[0]
        self.params['productId'] = product_id
        yield scrapy.FormRequest(
            url=self.base_url,
            headers=self.headers,
            callback=self.parse,
            method="GET",
            formdata=self.params,
        )

    def parse(self, response):
        for item in response.json()['reviews']:
            reviewer = item['name']
            review = item['summary']
            yield {"reviewer": reviewer, "review": review}

Remove unnecessary URLs in Scrapy

import scrapy
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            print(url)
I want to remove these unnecessary URLs from the extracted links. The website is https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx:
http://www.unbr.ro
http://www.inppa.ro
http://www.uniuneanotarilor.ro/
http://www.caav.ro
http://www.executori.ro/
http://www.csm1909.ro
http://www.inm-lex.ro
http://www.just.ro
You can use the endswith method together with the continue keyword to skip the unwanted URLs:
import scrapy
from scrapy.http import Request

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            # skip the external portal links, which all end in '.ro' or '.ro/'
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            print(url)
Output:
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159077&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159076&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159075&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159021&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159020&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159019&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=159018&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=21846&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=165927&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=83465&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=47724&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=32097&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=29573&Signature=378270
https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=19880&Signature=378270
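A more general alternative (a sketch, not part of the original answer) is to compare each link's host against the site's own domain with urllib.parse, which also drops external links that do not happen to end in '.ro':

from urllib.parse import urlparse

allowed_host = 'www.ifep.ro'

def is_internal(url):
    # keep only links that point back to the crawled site
    return urlparse(url).netloc == allowed_host

links = [
    'http://www.unbr.ro',
    'https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270',
]
print([u for u in links if is_internal(u)])
# -> ['https://www.ifep.ro/justice/lawyers/LawyerFile.aspx?RecordId=1091&Signature=378270']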

Scrapy and Selenium together for dynamic pages

I have a problem with going to the next page: the spider moves to the next page but then returns to the first page again, so it only gives the data of page 1. I have tried different approaches but have not been able to solve this. This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy.http import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def __init__(self):
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath("//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
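One way to actually capture each page's data inside that loop (a sketch, not part of the original question; it assumes the pager is driven by JavaScript postbacks and that a chromedriver is available) is to hand the HTML Selenium has rendered back to a Scrapy Selector after every click:

import time
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get('https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx')

while True:
    # parse the HTML that Selenium has actually rendered for this page
    sel = scrapy.Selector(text=driver.page_source)
    for href in sel.xpath("//div[@class='list-group']//@href").getall():
        print(href)
    try:
        driver.find_element(By.XPATH, "//a[@id='MainContent_PagerTop_NavNext']").click()
    except NoSuchElementException:
        break
    time.sleep(1)  # crude wait for the next page to load

driver.quit()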

Want to remove some text from the line

I need only the address, not the Tel, Fax and Email. When I run the code it gives me the whole block, but I want only the address. This is the page link: https://all.accor.com/hotel/8392/index.de.shtml
from scrapy import Spider
from scrapy.http import Request

class AuthorSpider(Spider):
    name = 'pushpa'
    start_urls = ['https://all.accor.com/de/region/hotels-sachsen-dsn.shtml']
    page_number = 0
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//a[@class='Teaser-link']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        title = response.xpath("//h3//text()").get()
        address = response.xpath("//div[@class='infos__content']//p//text()")[:-3].getall()
        address = [i.strip() for i in address]
        # remove empty strings:
        address = [i for i in address if i]
        yield {
            'name': title,
            'address': address,
        }
Your XPath selector for the address is wrong. You need to restrict the text to the first child div of the div with class infos__content. Use the code below for the parse_book method and it should work.
def parse_book(self, response):
    title = response.xpath("//h3//text()").get()
    address = response.xpath("normalize-space(//div[@class='infos__content']/div[1]/p)").get()
    # replace non-breaking spaces with regular spaces
    address = address.replace("\xa0", " ")
    yield {
        'name': title,
        'address': address,
    }

Trouble outputting data with Scrapy

I am attempting to extract info about articles from this site. I am a Scrapy newbie and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs printed. I am unable to figure out what I am missing or need to change. Any help would be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but at
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
And you probably have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but at
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but at
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
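A quick way to verify selectors like these (a sketch; it assumes the article markup still carries these attributes) is Scrapy's interactive shell against any article URL:

# In a terminal (replace the placeholder with a real article URL):
#   scrapy shell "https://www.artofmanliness.com/<article-url>/"
# Then, at the shell prompt:
>>> response.xpath('//h1[@itemprop="headline"]/text()').get()
>>> response.xpath('//p[@class="in-category"]//a/text()').getall()
>>> response.xpath('//span[@itemprop="datePublished"]/text()').get()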
Minimal working code with CrawlerProcess(): you can paste it all into one file, script.py, and run it as python script.py without creating a project. I use max_pages = 2 to test only a few articles.
import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
