I do not know how to scrape AJAX pages. There is no pagination on the website; more content is loaded by clicking the "load more" button. This is the page link: https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false
import scrapy
from scrapy.http import Request
from selenium import webdriver
from scrapy_selenium import SeleniumRequest
import pandas as pd
class TestSpider(scrapy.Spider):
    """Scrape exhibitor details from the AAOS 2022 exhibitor gallery.

    The gallery page is requested with ``SeleniumRequest``; every exhibitor
    link found on it is then fetched with a plain Scrapy ``Request`` and
    parsed by ``parse_book``.
    """

    name = 'test'

    def start_requests(self):
        """Yield the single Selenium-backed request for the gallery page."""
        yield SeleniumRequest(
            url="https://aaos22.mapyourshow.com/8_0/explore/exhibitor-gallery.cfm?featured=false",
            wait_time=3,
            screenshot=True,
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        """Follow the link of every exhibitor card found in the response."""
        # Match on one class token instead of the full, whitespace-sensitive
        # class attribute so that formatting changes do not break the match.
        links = response.xpath(
            "//h3[contains(@class, 'card-Title')]//a/@href"
        ).getall()
        for link in links:
            yield Request(response.urljoin(link), callback=self.parse_book)

    @staticmethod
    def _clean(value):
        """Return ``value`` stripped, or ``None`` when nothing was selected."""
        return value.strip() if value is not None else None

    def parse_book(self, response):
        """Yield title, address, website and phone for one exhibitor page.

        Fields that are missing from the page are emitted as ``None`` rather
        than raising ``AttributeError``.
        """
        title = response.css(".mr3-m::text").get()
        address = self._clean(response.css(".showcase-address::text").get())
        website = self._clean(
            response.xpath(
                "//li[@class='dib ml3 mr3']//a[starts-with(@href, 'http')]/@href"
            ).get()
        )
        phone = self._clean(
            response.xpath(
                "//li[@class='dib ml3 mr3'] //span[contains(text(), 'Phone:')]"
                "/following-sibling::text()"
            ).get()
        )
        if phone is not None:
            phone = phone.replace("-", "")
        yield {
            'title': title,
            'address': address,
            'website': website,
            'phone': phone,
        }
Okay, try the following script to get all the fields you wish to grab from there traversing all the exhibitor list:
import scrapy
from scrapy.selector import Selector
class MapYourShowSpider(scrapy.Spider):
    """Collect exhibitor details through the site's AJAX search endpoint.

    ``start_requests`` queries ``content_url`` for the exhibitor list, then
    each exhibitor's detail URL is requested with the same AJAX headers and
    parsed from the HTML fragment embedded in the JSON response.
    """

    name = "mapyourshow"
    content_url = 'https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm'
    inner_base = 'https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={}'

    headers = {
        'x-requested-with': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }
    # Query-string parameters sent to the search endpoint.
    params = {
        'action': 'search',
        'searchtype': 'exhibitorgallery',
        'searchsize': '557',
        'start': '0',
    }

    def start_requests(self):
        """Yield the GET request that returns the exhibitor list as JSON."""
        yield scrapy.FormRequest(
            url=self.content_url,
            method='GET',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )

    def parse(self, response):
        """Yield one detail request per exhibitor hit in the JSON payload."""
        for item in response.json()['DATA']['results']['exhibitor']['hit']:
            inner_link = self.inner_base.format(item['fields']['exhid_l'])
            yield scrapy.Request(
                url=inner_link,
                headers=self.headers,
                callback=self.parse_content,
            )

    def parse_content(self, response):
        """Extract title, address, website and phone from a detail response.

        The detail endpoint answers with JSON whose ``DATA.BODYHTML`` value
        is an HTML fragment; that fragment is what gets parsed here.
        """
        sel = Selector(text=response.json()['DATA']['BODYHTML'])
        title = sel.css("h2::text").get()
        # Collapse internal whitespace of every address text node and join
        # them; an absent address yields an empty string.
        address = ' '.join(
            ' '.join(part.split())
            for part in sel.css("p.showcase-address::text").getall()
        )
        website = sel.css("a[title*='website']::text").get()
        phone = sel.xpath(
            "normalize-space(//*[starts-with(@class,'showcase-web-phone')]"
            "/li[./*[.='Phone:']]/span/following::text())"
        ).get()
        yield {"title": title, "address": address, "website": website, "phone": phone}
I have not used your code and did it rather my way (because I'm not a huge fan of selenium). But I hope this helps anyway:
import requests
import json
import time
from bs4 import BeautifulSoup
import re
# Header that marks the request as an AJAX call.
headers = {
    'x-requested-with': 'XMLHttpRequest',
}
params = {
    'action': 'search',
    'searchtype': 'exhibitorgallery',
    'searchsize': '200',  # don't increase this too much (increase the start parameter instead and send a new request after some delay)
    'start': '0',
}

response = requests.get('https://aaos22.mapyourshow.com/8_0/ajax/remote-proxy.cfm', params=params, headers=headers)
# Fail loudly on an HTTP error instead of trying to parse an error page.
response.raise_for_status()
data = response.json()

# Detail-page URL of every exhibitor returned by the search.
all_sites = [
    f"https://aaos22.mapyourshow.com/8_0/exhibitor/exhibitor-details.cfm?exhid={exs['fields']['exhid_l']}"
    for exs in data["DATA"]["results"]["exhibitor"]["hit"]
]

for index, site in enumerate(all_sites):
    if index:
        # delay between requests so you don't create too much traffic
        time.sleep(1)

    response = requests.get(site)
    soup = BeautifulSoup(response.text, "html.parser")
    info_box = soup.find("div", {"id": "showroomContentDiv"})
    if info_box is None:
        # Page without the expected content container: nothing to extract.
        print(f"skipped (no showroom content): {site}")
        continue

    description = info_box.find("section", {"id": "scroll-description"})
    address_tag = info_box.find("p", {"class": "showcase-address"})
    contact_list = info_box.find("ul", {"class": "showcase-web-phone"})
    contact_items = contact_list.find_all("li") if contact_list else []

    # The first line of the description section holds the title after a
    # six-character prefix.
    title = description.text.strip().split("\n")[0][6:] if description else ""
    address = " ".join(address_tag.text.strip().split()) if address_tag else ""
    website = contact_items[0].text.strip() if contact_items else ""
    # The second entry starts with a seven-character label before the number.
    phone = contact_items[1].text[7:].strip() if len(contact_items) > 1 else ""

    print(title)
    print(address)
    print(website)
    print(phone)
Related
I want to crawl the user review section of this page: https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3. The spider returns an empty result even though the selector path is correct.
import scrapy
from scrapy import Selector,Request
class LaptopSpider(scrapy.Spider):
    """Fetch one Zappos product page and print the text of a review node."""

    name = 'cs'

    def start_requests(self):
        """Yield the request for the product page."""
        # The URL literal must not carry leading whitespace: depending on the
        # URL-sanitising library version it is either stripped or escaped
        # into the scheme, which makes the request fail.
        url = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
        yield Request(url, callback=self.parse)

    def parse(self, response):
        """Print the first text node matched by the review CSS selector."""
        products_selector = response.css(
            '#productRecap > div.p--z > div:nth-child(3) > div > div > div > div > div.Oi-z > div::text'
        ).get()
        print(products_selector)
Try this to get the reviews from the link in your post:
import scrapy
class ZapposSpider(scrapy.Spider):
    """Read product reviews from the Zappos reviews API.

    The product id is taken from ``link`` and sent as the ``productId``
    query parameter to ``base_url``.
    """

    name = 'zappos'
    link = 'https://www.zappos.com/p/lamade-mozza-halter-pullover-black/product/9796103/color/3'
    base_url = 'https://api.prod.cassiopeia.ugc.zappos.com/display/v2/reviews'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }
    # Query template; ``productId`` is filled in per request.
    params = {
        'offset': '0',
        'page': '1',
        'productId': '',
        'sort': 'upVotes:desc,overallRating:desc,reviewDate:desc'
    }

    def start_requests(self):
        """Yield the GET request for the first page of reviews."""
        product_id = self.link.split("product/")[1].split("/")[0]
        # Build a per-request copy: ``params`` is a class attribute, so
        # writing into it would leak the id into every other instance.
        query = dict(self.params, productId=product_id)
        yield scrapy.FormRequest(
            url=self.base_url,
            headers=self.headers,
            callback=self.parse,
            method="GET",
            formdata=query,
        )

    def parse(self, response):
        """Yield reviewer name and review summary for each returned review."""
        for item in response.json()['reviews']:
            yield {"reviewer": item['name'], "review": item['summary']}
I have a problem with going to the next page: the spider moves to the next page, but then returns to the first page and only yields the data of page 1. I have tried different approaches without success. If there is a solution, please share it. This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from selenium import webdriver
class TestSpider(scrapy.Spider):
    """Crawl the lawyers panel listing and scrape every linked detail page.

    The listing is paged with a "next" link, so the listing itself is walked
    in a Selenium-driven browser while the detail pages are fetched with
    plain Scrapy requests.
    """

    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Chrome driver."""
        # Let scrapy.Spider run its own initialisation before adding state.
        super().__init__(*args, **kwargs)
        # Raw string: the backslashes are path separators, not escapes.
        self.driver = webdriver.Chrome(r'C:\Program Files (x86)\chromedriver.exe')

    def closed(self, reason):
        """Quit the browser when the spider finishes."""
        self.driver.quit()

    def parse(self, response):
        """Walk every listing page and yield a request per detail link.

        Paging stops when the "next" link is missing, when clicking it does
        not load a new listing within ten seconds, or when a page contributes
        no link that has not been seen already.
        """
        from scrapy.selector import Selector
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.support.expected_conditions import staleness_of
        from selenium.webdriver.support.ui import WebDriverWait

        self.driver.get(response.url)
        seen = set()
        while True:
            page = Selector(text=self.driver.page_source)
            found_new = False
            for href in page.xpath("//div[@class='list-group']//@href").getall():
                url = response.urljoin(href)
                if url.endswith(('.ro', '.ro/')) or url in seen:
                    continue
                seen.add(url)
                found_new = True
                yield scrapy.Request(url, callback=self.parse_book)

            if not found_new:
                # Guards against looping forever on a pager that keeps
                # serving the same page.
                break

            try:
                # String locators work with both the Selenium 3 and 4 APIs.
                listing = self.driver.find_element(
                    "xpath", "//div[@class='list-group']")
                next_link = self.driver.find_element(
                    "xpath", "//a[@id='MainContent_PagerTop_NavNext']")
                next_link.click()
                # NOTE(review): assumes the listing element is replaced when
                # the next page loads - confirm against the live site.
                WebDriverWait(self.driver, 10).until(staleness_of(listing))
            except WebDriverException:
                break

    @staticmethod
    def _first_text(response, xpath):
        """Return the first match of ``xpath`` stripped, or ``None``."""
        value = response.xpath(xpath).get()
        return value.strip() if value is not None else None

    def parse_book(self, response):
        """Yield the heading and the first four paragraphs of a detail page."""
        yield {
            "title1": response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get(),
            "title2": self._first_text(response, "//div[@class='col-md-10']//p[1]//text()"),
            "title3": self._first_text(response, "//div[@class='col-md-10']//p[2]//text()"),
            "title4": self._first_text(response, "//div[@class='col-md-10']//p[3]//span//text()"),
            "title5": self._first_text(response, "//div[@class='col-md-10']//p[4]//text()"),
        }
I am attempting to extract information about articles from this site. I am a Scrapy newbie and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs printed. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
    """Crawl the article index pages and scrape each linked article."""

    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        """Yield one request per index page, numbered 1..max_pages."""
        # Start at 1 so that exactly ``max_pages`` pages are requested and
        # the non-existent page 0 is not.
        for i in range(1, self.max_pages + 1):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        """Yield a request for the first link of every article teaser."""
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield title, category, date and URL of one article page."""
        # NOTE(review): these XPaths select elements by @id; verify the ids
        # against the article markup if the fields come back empty.
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
# Project identity and the package that holds the spiders.
BOT_NAME = 'aom'
NEWSPIDER_MODULE = 'aom.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# User-agent string sent with every request.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"

# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False

# Hand responses with any HTTP status code to the spider callbacks.
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title matching
'//*[@id="post-title entry-title"]/header/h1//text()'
but there is one matching
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
You probably have the same problem with the other elements.
EDIT:
There is no category matching
'//*[@id="in-category"]/header/p[1]//text()'
but there is one matching
'//p[@class="in-category"]//a/text()'
There is no date matching
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but there is one matching
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess().
Everyone can paste all code in one file script.py and run it as python script.py without creating project.
I use max_pages = 2 to test only few articles.
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
    """Crawl the article index pages and scrape each linked article."""

    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        """Yield one request per index page, numbered 1..max_pages."""
        # Start at 1 so that exactly ``max_pages`` pages are requested and
        # the non-existent page 0 is not.
        for i in range(1, self.max_pages + 1):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        """Yield a request for the first link of every article teaser."""
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield title, category, date and URL of one article page.

        The fields are selected through ``itemprop``/``class`` attributes.
        """
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
from scrapy.crawler import CrawlerProcess
# Start the crawl only when the file is executed as a script, so that
# importing the module does not launch a crawler as a side effect.
if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
        # save in file CSV, JSON or XML
        'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
    })
    c.crawl(ArticlesSpider)
    c.start()
I've built a simple scrapy spider running on scrapinghub:
class ExtractionSpider(scrapy.Spider):
    """Follow offer links and pagination, yielding the raw detail markup."""

    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def parse(self, response):
        """Request every offer link, every multi-location link, then the next page."""
        link_selectors = (
            'a.offer-details__title-link::attr(href)',
            'a.offer-regions__label::attr(href)',
        )
        # Both link groups get the same treatment: print the extracted hrefs
        # and request each one through Splash.
        for link_selector in link_selectors:
            hrefs = response.css(link_selector).extract()
            print(hrefs)
            for href in hrefs:
                yield SplashRequest(url=response.urljoin(href), callback=self.parse_details)

        next_href = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_href:
            yield SplashRequest(url=response.urljoin(next_href), callback=self.parse)

    def parse_details(self, response):
        """Yield the first matching element for each field of an offer page."""
        field_selectors = (
            ('title', '#jobTitle'),
            ('content', '#description'),
            ('datePosted', 'span[itemprop="datePosted"]'),
            ('address', 'span[itemprop="address"]'),
        )
        yield {
            field: response.css(css).extract_first()
            for field, css in field_selectors
        }
The problem I am facing is that the multiple_locs_url response.css returns an empty array despite me seeing it in the markup on the browser side.
I checked with scrapy shell and scrapy shell does not see the markup. I guess this is due to the markup being rendered through javascript when the page is loaded.
I added splash but that does not seem to apply to response. How would I make scrapy wait with the query until the page is loaded?
See source code for the page: view-source:pracuj.pl/praca/polska;ct,1 .
There is no element with class "offer-regions__label" in html code.
This code will always return an empty list:
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
But as explained here https://stackoverflow.com/a/17697329/9913319:
Many times when crawling we run into problems where content that is
rendered on the page is generated with Javascript and therefore scrapy
is unable to crawl for it.
In this case you can use Selenium.
I changed your code and checked it and it works:
class ExtractionSpider(scrapy.Spider):
    """Offer spider that also opens each listing page in a Firefox driver.

    The Selenium driver is used to print the offer links as the browser sees
    them; the requests themselves are still built from the Scrapy response.
    """

    name = "extraction"
    allowed_domains = ['domain']
    start_urls = ['http://somedomainstart']
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    def __init__(self, **kwargs):
        """Initialise the spider and start the Firefox driver."""
        super().__init__(**kwargs)
        profile = webdriver.FirefoxProfile("pathToFirefoxProfile")
        firefox_binary = "pathToFirefoxBinary"  # Must be the developer edition!!!
        self.driver = webdriver.Firefox(profile, firefox_binary=firefox_binary)

    def closed(self, reason):
        """Quit the browser when the spider finishes."""
        self.driver.quit()

    def parse(self, response):
        """Print the browser-side offer links, then yield the follow-up requests."""
        self.driver.get(response.url)
        # String locators work with both the Selenium 3 and 4 APIs. The page
        # must not be reloaded before the elements are read: a reload
        # invalidates the element references collected here.
        elements = self.driver.find_elements(
            "css selector", "a.offer-details__title-link")
        for element in elements:
            print("****")
            print(str(element.get_attribute("href")))
            print(str(element.text))

        # your old code below
        link_selectors = (
            'a.offer-details__title-link::attr(href)',
            'a.offer-regions__label::attr(href)',
        )
        for link_selector in link_selectors:
            urls = response.css(link_selector).extract()
            print(urls)
            for url in urls:
                yield SplashRequest(url=response.urljoin(url), callback=self.parse_details)

        next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
        if next_page_url:
            yield SplashRequest(url=response.urljoin(next_page_url), callback=self.parse)

    def parse_details(self, response):
        """Yield the first matching element for each field of an offer page."""
        yield {
            'title': response.css('#jobTitle').extract_first(),
            'content': response.css('#description').extract_first(),
            'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
            'address': response.css('span[itemprop="address"]').extract_first()
        }
I am using Python 3.5.2 and Scrapy 1.1.
There is a nested request in the demo below: on the article content page there is an AJAX request that returns the author when the user is logged in.
I wrote it as follows, but I can't get the author, and I have a hard time understanding where the problem is.
demo:
# -*- coding: utf-8 -*-
import scrapy
from demo.items import ExampleItem
from scrapy.spiders import CrawlSpider
import re
class ExampleSpider(CrawlSpider):
    """Scrape articles and fetch each author through a follow-up AJAX call.

    The item built on the article page is carried to the author request in
    ``request.meta`` and is yielded only once the author has been added.
    """

    name = "example"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/articles-list.php?page=1",
        "http://www.example.com/articles-list.php?page=2",
        "http://www.example.com/articles-list.php?page=3",
        "http://www.example.com/articles-list.php?page=4",
        "http://www.example.com/articles-list.php?page=5",
        "http://www.example.com/articles-list.php?page=6",
    ]
    # Headers of the logged-in AJAX call, as captured from the browser.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'PHPSESSID=12345370000029b72333333dc999999; QS[uid]=100; QS[username]=example; QS[password]=example.com; QS[pmscount]=1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2774.3 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }

    @staticmethod
    def _split_cookie_header(header_value):
        """Turn a ``name=value; name=value`` string into a dict."""
        cookies = {}
        for pair in header_value.split(';'):
            key, sep, value = pair.strip().partition('=')
            if sep:
                cookies[key] = value
        return cookies

    def parse(self, response):
        """Yield a request for every article link of a list page."""
        # NOTE(review): 'a/@href' is relative to the document root; confirm
        # this matches the real list markup.
        hrefs = response.xpath('a/@href')
        for href in hrefs:
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_article_contents)

    def parse_article_contents(self, response):
        """Build the item and chain the request that fetches its author.

        When the article URL carries no ``id=`` parameter the item is
        yielded right away without an author.
        """
        # Cookies go through the ``cookies`` argument and are left out of
        # the headers, because Scrapy's cookie handling manages the Cookie
        # header itself.
        ajax_headers = {
            key: value for key, value in self.headers.items()
            if key.lower() != 'cookie'
        }
        ajax_cookies = self._split_cookie_header(self.headers.get('Cookie', ''))

        for sel in response.xpath('/html/body'):
            item = ExampleItem()
            item['articleUrl'] = response.url
            # extract_first() gives None for a missing node instead of
            # raising IndexError.
            item['title'] = sel.xpath('div[3]/a[2]/@href').extract_first()
            item['content'] = sel.xpath('div[2]/div[2]/div[1]/text()').extract_first()

            match = re.search(r'id=(\d{1,4})&', response.url)
            if match is None:
                yield item
                continue

            author_url = 'http://www.example.com/plus/ajax_author.php?id=' + match.group(1)
            yield scrapy.Request(
                url=author_url,
                headers=ajax_headers,
                cookies=ajax_cookies,
                meta={'item': item},
                callback=self.parse_article_author,
            )

    def parse_article_author(self, response):
        """Add the author to the item carried in ``meta`` and yield it."""
        item = response.meta['item']
        item['author'] = response.xpath('/html/body/div/div[1]/div[2]/text()').extract()
        yield item
Any ideas?