Scrapy with selenium together the data of dynamic pages

Scrapy with selenium together the data of dynamic pages - python

I have a problem with going to next page they will go to next page but then they will again return to first page and they will give only the data of page 1 I have trying different approches but I am not successfull to solve these problem if any solution then provide me this is page link https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from selenium import webdriver
class TestSpider(scrapy.Spider):
name = 'test'
start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//div[#class='list-group']//#href").extract()
for book in books:
url = response.urljoin(book)
if url.endswith('.ro') or url.endswith('.ro/'):
continue
yield Request(url, callback=self.parse_book)
def __init__(self):
self.driver = webdriver.Chrome('C:\Program Files (x86)\chromedriver.exe')
def parse_book(self, response):
title=response.xpath("//span[#id='HeadingContent_lblTitle']//text()").get()
d1=response.xpath("//div[#class='col-md-10']//p[1]//text()").get()
d1=d1.strip()
d2=response.xpath("//div[#class='col-md-10']//p[2]//text()").get()
d2=d2.strip()
d3=response.xpath("//div[#class='col-md-10']//p[3]//span//text()").get()
d3=d3.strip()
d4=response.xpath("//div[#class='col-md-10']//p[4]//text()").get()
d4=d4.strip()
yield{
"title1":title,
"title2":d1,
"title3":d2,
"title4":d3,
"title5":d4,
}
self.driver.get(response.url)
while True:
next = self.driver.find_element_by_xpath("//a[#id='MainContent_PagerTop_NavNext']")
try:
next.click()
# get the data and write it to scrapy items
except:
break

Related

Want to remove some text from the line

I need only the address not need tel, Fax, Email When I run the code they give me the whole data but I want only the address this is page link https://all.accor.com/hotel/8392/index.de.shtml
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
name = 'pushpa'
start_urls = ['https://all.accor.com/de/region/hotels-sachsen-dsn.shtml']
page_number = 0
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//a[#class='Teaser-link']//#href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
title=response.xpath("//h3//text()").get()
address = response.xpath("//div[#class='infos__content']//p//text()")[:-3].getall()
address = [i.strip() for i in address]
# remove empty strings:
address = [i for i in address if i]
yield{
'name':title,
'address':address,
}

Your xpath selector for the address is wrong. You need to restrict the text you want to be from the first child of the div with a class of infos__content. Use the below code for the parse_book method and it should work.
def parse_book(self, response):
title=response.xpath("//h3//text()").get()
address = response.xpath("normalize-space(//div[#class='infos__content']/div[1]/p)").get()
address = address.replace("\xa0", " ")
yield{
'name':title,
'address':address,
}

Xpath not found using scrapy

I want to extract email and phone but I could not find the xpath for it. I would only retrieve the xpath of website, this is the link of the page where I extracted the data: https://www.fiduciairesuisse-vd.ch/directory/abc-gestion-sa
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
name = 'pushpa'
start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//td[#class='views-field views-field-title']//#href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
link = response.xpath("//a[#class='field__item link link--external']//#href").get()
yield{
'website':link
}

Now,it's working.
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
name = 'pushpa'
start_urls = ['https://www.fiduciairesuisse-vd.ch/fr/adhesion/trouver-un-membre-partenaire-de-confiance?state=All&section=461&class=All&lang=All']
custom_settings = {
'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
'DOWNLOAD_DELAY': 1,
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
}
def parse(self, response):
books = response.xpath("//td[#class='views-field views-field-title']//#href").extract()
for book in books:
url = response.urljoin(book)
yield Request(url, callback=self.parse_book)
def parse_book(self, response):
link = response.xpath("//a[#class='field__item link link--external']//#href").get()
yield{
'website':link,
'phone':response.xpath('normalize-space(//*[#class="s-mrgb-05 s-mrgt-05"]//text()[2])').get(),
'email':response.xpath('normalize-space(//*[#class="s-mrgb-05 s-mrgt-05"]/div[1]//text()[2])').get()
}

Trouble outputting data with Scrapy

I am attempting to extract info about articles from this site. I am a Scrapy newbie, and bit stuck as to why I don't getting any output, although I I am able to get all the correct URL outputted. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
name = 'articles'
allowed_domains = ['artofmanliness.com']
max_pages = 200
def start_requests(self):
for i in range(self.max_pages):
yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)
def parse(self, response):
# AOM has a list of all articles in pages of about 189
for article in response.xpath('//article[contains(#class, "aom-article-simple")]'):
url = article.xpath('.//a/#href').extract()
print(url)
if url:
yield Request(url=url[0], callback=self.parse_article)
def parse_article(self, response):
title = response.xpath('//*[#id="post-title entry-title"]/header/h1//text()').extract()
category = response.xpath('//*[#id="in-category"]/header/p[1]//text()').extract()
date = response.xpath('//*[#id="single-date"]/header/p[2]/span[2]//text()').extract()
yield {
'Title': title,
'Category': category,
'Date': date,
'URL': response.url
}
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True

I checked HTML and there is no title
'//*[#id="post-title entry-title"]/header/h1//text()'
but
'//h1[#class="post-title entry-title"]/text()'
or even simpler
'//h1[#itemprop="headline"]/text()
And probably you have the same problem with other elements
EDIT:
There is no category
'//*[#id="in-category"]/header/p[1]//text()'
but
'//p[#class="in-category"]//a/text()'
There is no date
'//*[#id="single-date"]/header/p[2]/span[2]//text()'
but
'//p[#class="single-date"]//span[2]/text()'
or even simpler
'//span[#itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess().
Everyone can paste all code in one file script.py and run it as python script.py without creating project.
I use max_pages = 2 to test only few articles.
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
name = 'articles'
allowed_domains = ['artofmanliness.com']
max_pages = 2 # 200
def start_requests(self):
for i in range(self.max_pages):
yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)
def parse(self, response):
# AOM has a list of all articles in pages of about 189
for article in response.xpath('//article[contains(#class, "aom-article-simple")]'):
url = article.xpath('.//a/#href').extract()
print('article url:', url)
if url:
yield Request(url=url[0], callback=self.parse_article)
def parse_article(self, response):
#title = response.xpath('//h1[#class="post-title entry-title"]/text()').extract()
title = response.xpath('//h1[#itemprop="headline"]/text()').extract()
category = response.xpath('//p[#class="in-category"]//a/text()').extract()
#date = response.xpath('//p[#class="single-date"]//span[2]/text()').extract()
date = response.xpath('//span[#itemprop="datePublished"]/text()').extract()
yield {
'Title': title,
'Category': category,
'Date': date,
'URL': response.url
}
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()

Scrapy does not fetch markup on response.css

I've built a simple scrapy spider running on scrapinghub:
class ExtractionSpider(scrapy.Spider):
name = "extraction"
allowed_domains = ['domain']
start_urls = ['http://somedomainstart']
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
def parse(self, response):
urls = response.css('a.offer-details__title-link::attr(href)').extract()
print(urls)
for url in urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
print(multiple_locs_urls)
for url in multiple_locs_urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield SplashRequest(url=next_page_url, callback=self.parse)
def parse_details(self, response):
yield {
'title': response.css('#jobTitle').extract_first(),
'content': response.css('#description').extract_first(),
'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
'address': response.css('span[itemprop="address"]').extract_first()
}
The problem I am facing is that the multiple_locs_url response.css returns an empty array despite me seeing it in the markup on the browser side.
I checked with scrapy shell and scrapy shell does not see the markup. I guess this is due to the markup being rendered through javascript when the page is loaded.
I added splash but that does not seem to apply to response. How would I make scrapy wait with the query until the page is loaded?

See source code for the page: view-source:pracuj.pl/praca/polska;ct,1 .
There is no element with class "offer-regions__label" in html code.
This code will always return an empty list:
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
But as explained here https://stackoverflow.com/a/17697329/9913319:
Many times when crawling we run into problems where content that is
rendered on the page is generated with Javascript and therefore scrapy
is unable to crawl for it.
In this case you can use Selenium.
I changed your code and checked it and it works:
class ExtractionSpider(scrapy.Spider):
name = "extraction"
allowed_domains = ['domain']
start_urls = ['http://somedomainstart']
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
def __init__( self, **kwargs ):
super().__init__( **kwargs )
profile = webdriver.FirefoxProfile( "pathToFirefoxProfile" )
firefox_binary = "pathToFirefoxBinary" # Must be the developer edition!!!
# self.driver = webdriver.Firefox()
self.driver = webdriver.Firefox( profile, firefox_binary = firefox_binary )
def parse(self, response):
self.driver.get( response.url )
elements = self.driver.find_elements_by_css_selector( "a.offer-details__title-link" )
self.driver.get( response.url )
for element in elements:
print( "****" )
print( str( element.get_attribute( "href" ) ) )
print( str( element.text ) )
# your old code below
urls = response.css('a.offer-details__title-link::attr(href)').extract()
print(urls)
for url in urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
print(multiple_locs_urls)
for url in multiple_locs_urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield SplashRequest(url=next_page_url, callback=self.parse)
def parse_details(self, response):
yield {
'title': response.css('#jobTitle').extract_first(),
'content': response.css('#description').extract_first(),
'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
'address': response.css('span[itemprop="address"]').extract_first()
}

Scrapy Nested request

I am using Python 3.5.2 and Scrapy 1.1.
There is a nested request in the demo below, specifically, in the page of article content, there is an ajax request which gets the author when he logs in.
I wrote it as follow, I can't get the author and I have a hard time understanding where the problem is.
demo:
# -*- coding: utf-8 -*-
import scrapy
from demo.items import ExampleItem
from scrapy.spiders import CrawlSpider
import re
class ExampleSpider(CrawlSpider):
name = "example"
allowed_domains = ["example.com"]
start_urls = [
"http://www.example.com/articles-list.php?page=1",
"http://www.example.com/articles-list.php?page=2",
"http://www.example.com/articles-list.php?page=3",
"http://www.example.com/articles-list.php?page=4",
"http://www.example.com/articles-list.php?page=5",
"http://www.example.com/articles-list.php?page=6",
]
headers = {
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
'Connection':'keep-alive',
'Cookie':'PHPSESSID=12345370000029b72333333dc999999; QS[uid]=100; QS[username]=example; QS[password]=example.com; QS[pmscount]=1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2774.3 Safari/537.36',
'X-Requested-With':'XMLHttpRequest'
}
def parse(self, response):
hrefs = response.xpath('a/#href')
for href in hrefs:
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_article_contents)
def parse_article_contents(self, response):
for sel in response.xpath('/html/body'):
item = ExampleItem()
item['articleUrl'] = response.url
item['title'] = sel.xpath('div[3]/a[2]/#href')[0].extract()
item['content'] = sel.xpath('div[2]/div[2]/div[1]/text()')[0].extract()
#In the page of artile content,there is an ajax request,which get the auther when login.
articleId = re.search(u'id=(\d{1,4})&', item['articleUrl']).group(1)
articleAuthorUrl = 'http://www.example.com/plus/ajax_author.php?id=' + articleId
#Crawling auther below.Is it correct?
def request_article_author(self):
return scrapy.Request(url=articleAuthorUrl,headers=headers,callback=self.parse_article_author)
def parse_article_author(self, response):
item['author'] = response.xpath('/html/body/div/div[1]/div[2]/text()').extract()
# item['author'] can be yielded var "yield item" below?
yield item
Any ideas?

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy with selenium together the data of dynamic pages - python

Related

Want to remove some text from the line

Xpath not found using scrapy

Trouble outputting data with Scrapy

Scrapy does not fetch markup on response.css

Scrapy Nested request

Categories

Resources