How to navigate through js/ajax based pagination while scraping a website? - python

My code works fine, but only for the first page of each category; I want to scrape all pages of every category. I'm not able to navigate to the next pages because the website uses AJAX to populate the data when I click the Next button.
I have also looked at the AJAX request the site makes to populate the data dynamically (https://www.couponcodesme.com/ae/category/searchfilter is the URL that shows up in the network tab when I click the next-page button), but I couldn't find a way to reproduce that request manually with Scrapy.
If it's possible to mock the AJAX request, please let me know how to do it for this particular problem. You're welcome to suggest any solution other than Scrapy-Splash. I have searched Stack Overflow but didn't find a proper solution to this problem.
Thank you.
import scrapy
from scrapy import Request
from ..items import CouponcollectItem
from scrapy_splash import SplashRequest

class Couponsite5SpiderSpider(scrapy.Spider):
    name = 'couponSite5_spider'
    allowed_domains = ['www.couponcodesme.com']

    script = '''
        function main(splash, args)
            local url = splash.args.url
            assert(splash:go(url))
            assert(splash:wait(5))
            assert(splash:runjs("$('a.category_pagination_btn.next_btn.top-page-button').click()"))
            assert(splash:wait(5))
            return {
                html = splash:html()
            }
        end
    '''

    def start_requests(self):
        yield Request(
            url="https://www.couponcodesme.com/ae/categories",
            callback=self.parse
        )

    def parse(self, response):
        urls = response.xpath('//ul[@class="flexboxesmain categorieslist"]/li/a/@href').extract()
        for url in urls:
            yield SplashRequest(
                url=url,
                callback=self.parse_params,
                endpoint="execute",
                args={
                    'wait': 1,
                    'lua_source': self.script
                }
            )

    def parse_params(self, response):
        items = CouponcollectItem()
        coupon_category = response.xpath('//div[@class="head_main"]/h1[@class="h2_title"]/text()').extract()
        coupon_lists = response.css('#temp1')
        for coupon in coupon_lists.xpath('div'):
            coupon_title = coupon.xpath('normalize-space(.//h3/a/text())').extract()
            coupon_store_name = coupon.xpath('normalize-space(.//div[@class="img-vert-center setheight brdrclr"]/a/@href)').extract()
            store_img_src = coupon.xpath('normalize-space(.//div[@class="img-vert-center setheight brdrclr"]/a/img/@data-src)').extract()
            coupon_code_txt = coupon.xpath('normalize-space(.//span[@class="offer_code"]/span/text())').extract()
            coupon_store_out = coupon.xpath('.//button/@data-alt').extract()

            items['coupon_title'] = [self.deEmojify(coupon_title[0]) if len(coupon_title) != 0 else '']
            items['coupon_code_txt'] = [coupon_code_txt[0] if len(coupon_code_txt) != 0 else '']
            items['coupon_store_out'] = [coupon_store_out[0] if len(coupon_store_out) != 0 else '']
            items['store_img_src'] = [store_img_src[0] if len(store_img_src) != 0 else '']
            items['website_link'] = [response.request.url]

            if len(coupon_category) != 0:
                if coupon_category[0].endswith(' Coupons'):
                    items['coupon_category'] = [self.deEmojify(coupon_category[0][:-8])]
                else:
                    items['coupon_category'] = [self.deEmojify(coupon_category[0])]
            else:
                items['coupon_category'] = ['']

            if len(coupon_store_name) != 0:
                if coupon_store_name[0].endswith(' Coupons'):
                    items['coupon_store_name'] = [self.deEmojify(coupon_store_name[0][:-8])]
                elif coupon_store_name[0].startswith('https://'):
                    items['coupon_store_name'] = [coupon_store_name[0].split('/')[-1]]
                else:
                    items['coupon_store_name'] = [self.deEmojify(coupon_store_name[0])]
            else:
                items['coupon_store_name'] = ['']

            yield items

    def deEmojify(self, inputString):
        return inputString.encode('ascii', 'ignore').decode('ascii')
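
One possible alternative to Splash, hinted at in the question itself, is to reproduce the AJAX call to https://www.couponcodesme.com/ae/category/searchfilter directly. Below is a minimal sketch; the spider name and the form fields (page, category) are placeholders and would have to be replaced with the real parameter names and values copied from the POST request visible in the browser's network tab.

    import scrapy

    class CouponFilterSpider(scrapy.Spider):
        # Hypothetical spider; not part of the original code.
        name = 'couponSite5_ajax_sketch'

        def start_requests(self):
            # Placeholder payload: copy the real field names/values from the
            # request that appears in the network tab when clicking "next".
            for page in range(1, 6):
                yield scrapy.FormRequest(
                    url='https://www.couponcodesme.com/ae/category/searchfilter',
                    formdata={'page': str(page), 'category': 'fashion'},  # assumed fields
                    headers={'X-Requested-With': 'XMLHttpRequest'},
                    callback=self.parse_filter,
                )

        def parse_filter(self, response):
            # The endpoint may return an HTML fragment or JSON; inspect the raw
            # response in the network tab before deciding how to parse it.
            for coupon in response.css('#temp1 > div'):
                yield {'coupon_title': coupon.xpath('normalize-space(.//h3/a/text())').get()}

If the endpoint answers with JSON rather than HTML, response.json() (Scrapy 2.2+) or the json module can replace the CSS parsing.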

Related

Crawled (403) error while logging in to glassdoor.com using Scrapy in Python. Need a solution?

Here is the complete code; I get a "Crawled (403)" error when I run it. If I bypass the HTTP error by adding HTTPERROR_ALLOWED_CODES = [403] to settings.py, the code starts working, but I still need a way to actually log in to the website.
import scrapy
from urllib.parse import urljoin
from scrapy.http import Request, FormRequest

class MoorSpider(scrapy.Spider):
    name = 'moor'
    allowed_domains = ['glassdoor.com']
    start_urls = ['https://www.glassdoor.com/profile/login_input.htm']
    page_number = 2

    def parse(self, response):
        token = response.xpath('.//*[@name="gdToken"]/@value').extract()
        # print(token)
        yield FormRequest('https://www.glassdoor.com/profile/ajax/loginSecureAjax.htm', formdata={'username': 'likej41679@94jo.com', 'password': '1a2b3c4d', 'gdToken': token}, callback=self.startscraper)

    def startscraper(self, response):
        yield Request('https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page=1&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US', callback=self.startscraper1)

    def startscraper1(self, response):
        urls = response.css('.col-12.my-0.mt-sm.mt-sm-std.order-5 a::attr(href)').extract()
        # print(next_page)
        for url in urls:
            url1 = urljoin('https://www.glassdoor.com/', url)
            yield Request(url1, callback=self.DetailPage)
        # next_page = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page='+str(MoorSpider.page_number)+'&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US'
        next_page = 'https://www.glassdoor.com/Explore/browse-companies.htm?overall_rating_low=3.5&page=' + str(
            MoorSpider.page_number) + '&isHiringSurge=0&locId=1282&locType=S&locName=North%20Carolina,%20US,%20US'
        if MoorSpider.page_number <= 2:
            MoorSpider.page_number += 1
            yield response.follow(next_page, callback=self.startscraper1)

    def DetailPage(self, response):
        Company_Website = response.css('[data-test="employer-website"]::text').get()
        Company_Revenue = response.css('[data-test="employer-revenue"]::text').get()
        Company_Description = response.css('span[data-test="employerDescription"]::text').get()
        Company_Mission = response.css('span[data-test="employerMission"]::text').get()
        yield {
            'Company_Website': Company_Website,
            'Company_Revenue': Company_Revenue,
            'Company_Description': Company_Description,
            'Company_Mission': Company_Mission,
        }
Replace your existing parse method with the following one in order for it to work. It turns out that your token variable holds nothing, as the value is generated dynamically; you can, however, parse the value of gdToken out of one of the script tags.
def parse(self, response):
    token = response.css('body').re(r"gdToken\":\"(.*?)\",")[0]
    yield FormRequest('https://www.glassdoor.com/profile/ajax/loginSecureAjax.htm', formdata={'username': 'likej41679@94jo.com', 'password': '1a2b3c4d', 'gdToken': token}, callback=self.startscraper)

Splash for Scrapy only returns empty list

I hope there's someone who can help a newbie.
I'm trying to scrape the prices from https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html using Scrapy. Since those prices are loaded dynamically with JavaScript, I tried to use Splash to deal with the problem, but the outcome is still the same: empty lists for the prices ("hotel_displayed_prices"). The other items all receive the correct values.
On the webpage I found two CSS selectors that lead to the price:
.price-wrap .price::text
.premium-offer-container div::attr(data-locationid)
Neither seems to work... or they both do and it's Splash that doesn't.
For Scrapy I copied all the configuration from https://github.com/scrapy-plugins/scrapy-splash into my settings file and also set ROBOTSTXT_OBEY = False.
When I render the website in Splash 3.4.1 (browser window), it shows me the hotel prices, so I would expect this to work.
import scrapy
from ..items import TestItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    name = "Testdata"
    start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 5})

    def parse(self, response):
        items = TestItem()
        all_single_entries = response.css("div.listItem")
        for entry in all_single_entries:
            hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
            hotel_links = entry.css(".listing_title a").xpath("@href").extract()
            hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid)").extract()

            items["hotel_names"] = str(hotel_names).split("'")[1]
            items["hotel_links"] = "https://www.tripadvisor.com" + str(hotel_links).split("'")[1]
            items["hotel_ids"] = int(str(hotel_ids).split("_")[1].split("'")[0])
            items["hotel_displayed_price"] = hotel_displayed_price

            yield items
On this line
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid").extract()
Are you missing a closing bracket in "div::attr(data-locationid"?
I've also had a look at the behaviour under Scrapy, and the prices are not returned in the HTML that Scrapy receives. What you're seeing in the browser (even in Splash) is not the same as what your code is seeing.
I don't know Scrapy well enough to work through this, but it seems possible to get what you need with plain old requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.tripadvisor.ie/Hotels-g189541-Copenhagen_Zealand-Hotels.html')
soup = BeautifulSoup(r.content, 'lxml')
prices = [price.text for price in soup.select('.price-wrap .price')]
print(prices)
['€131', '€112', '€121', '€133', '€172', '€169', '€74', '€189', ...]
For everyone with a similar problem, here is my solution. However, I do have problems with duplicates when I run the script.
import scrapy
from ..items import HotelinfoItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    name = "Hoteldata"
    start_urls = ["http://localhost:8050/render.html?url=https:"
                  "//www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})

    def parse(self, response):
        items = HotelinfoItem()
        all_single_entries = response.css("div.listItem")
        for entry in all_single_entries:
            hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
            hotel_links = entry.css(".listing_title a").xpath("@href").extract()
            hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-pernight)").extract()
            hotel_type = entry.css(".mb10").css(".label::text").extract()

            items["hotel_names"] = [str(hotel_names).split("'")[1]]
            items["hotel_links"] = ["https://www.tripadvisor.com" + str(hotel_links).split("'")[1]]
            items["hotel_ids"] = [str(hotel_ids).split("_")[1].split("'")[0]]

            if len(hotel_type) == 0:
                items["hotel_type"] = ["Hotel"]
            else:
                items["hotel_type"] = hotel_type

            if len(hotel_displayed_price) == 0:
                items["hotel_displayed_price"] = ["NA"]
            else:
                items["hotel_displayed_price"] = hotel_displayed_price

            yield items

        next_page = response.css("a.next::attr(href)").get()
        next_page_splash = "http://localhost:8050/render.html?url=https://www.tripadvisor.com" + \
                           str(next_page).split("#")[0] + "&timeout=10&wait=5"
        if next_page is not None:
            yield response.follow(next_page_splash, callback=self.parse)
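
Regarding the duplicates mentioned above: one option (a sketch, not part of the original code; the pipeline class and module path are assumptions) is a small item pipeline that drops any item whose hotel_ids value has already been seen.

    from scrapy.exceptions import DropItem

    class DropDuplicateHotelsPipeline:
        # Hypothetical pipeline: keeps the first item per hotel id, drops the rest.
        def __init__(self):
            self.seen_ids = set()

        def process_item(self, item, spider):
            hotel_id = tuple(item.get('hotel_ids', []))
            if hotel_id in self.seen_ids:
                raise DropItem("Duplicate hotel: %s" % (hotel_id,))
            self.seen_ids.add(hotel_id)
            return item

    # settings.py (module path assumed):
    # ITEM_PIPELINES = {'hotelinfo.pipelines.DropDuplicateHotelsPipeline': 300}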

TypeError('Request url must be str or unicode, got %s:' % type

I am trying to log in to IMDb and scrape some data.
Here is my code:
import scrapy
from scrapy.http import FormRequest

class lisTopSpider(scrapy.Spider):
    name = 'imdbLog'
    allowed_domains = ['imdb.com']
    start_urls = [
        'https://www.imdb.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https://www.imdb.com/registration/ap-signin-handler/imdb_us&openid.identity=http://specs.openid.net/auth/2.0/identifier_select&openid.assoc_handle=imdb_us&openid.mode=checkid_setup&siteState=eyJvcGVuaWQuYXNzb2NfaGFuZGxlIjoiaW1kYl91cyIsInJlZGlyZWN0VG8iOiJodHRwczovL3d3dy5pbWRiLmNvbS8_cmVmXz1sb2dpbiJ9&openid.claimed_id=http://specs.openid.net/auth/2.0/identifier_select&openid.ns=http://specs.openid.net/auth/2.0&tag=imdbtag_reg-20'
    ]

    def parse(self, response):
        token = response.xpath('//form/input[@name="appActionToken"]/@value').get()
        appAction = response.xpath('//form/input[@name="appAction"]/@value').get()
        siteState = response.xpath('//form/input[@name="siteState"]/@value').get()
        openid = response.xpath('//form/input[@name="openid.return_to"]/@value').get()
        prevRID = response.xpath('//form/input[@name="prevRID"]/@value').get()
        workflowState = response.xpath('//form/input[@name="workflowState"]/@value').get()
        create = response.xpath('//input[@name="create"]/@value').get()
        metadata1 = response.xpath('//input[@name="metadata1"]/@value').get()

        base_url = 'https://www.imdb.com/lists/tt0120852'

        if 'login' in response.url:
            return scrapy.Request(base_url, callback=self.listParse)
        else:
            return scrapy.Request(response, cookies=[{
                'appActionToken': token,
                'appAction': appAction,
                'siteState': siteState,
                'openid.return_to': openid,
                'prevRID': prevRID,
                'workflowState': workflowState,
                'email': '....@gmail.com',
                'create': create,
                'passwrod': '....',
                'metadata1': metadata1,
            }], callback=self.parse)

    def listParse(self, response):
        listsLinks = response.xpath('//div[2]/strong')
        for link in listsLinks:
            list_url = response.urljoin(link.xpath('.//a/@href').get())
            yield scrapy.Request(list_url, callback=self.parse_list, meta={'list_url': list_url})

        next_page_url = response.xpath('//a[@class="flat-button next-page "]/@href').get()
        if next_page_url is not None:
            next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.listParse)

    # Link of each list
    def parse_list(self, response):
        list_url = response.meta['list_url']
        myRatings = response.xpath('//div[@class="ipl-rating-star small"]/span[2]/text()').getall()
        yield {
            'list': list_url,
            'ratings': myRatings,
        }
At first I was getting an error along the lines of "no form object found", so I removed the FormRequest and used a plain Request instead.
Now I am getting the error "TypeError: Request url must be str or unicode, got %s: % type(url).__name__".
I am sure this code is far from working yet, but I need to fix this error, and I don't understand why it is happening.
PowerShell points to this line:
}], callback=self.parse)
The problem is this part:
return scrapy.Request(response, cookies=[{
    'appActionToken': token,
    'appAction': appAction,
    'siteState': siteState,
    'openid.return_to': openid,
    'prevRID': prevRID,
    'workflowState': workflowState,
    'email': '....@gmail.com',
    'create': create,
    'passwrod': '....',
    'metadata1': metadata1,
}], callback=self.parse)
Your first parameter is a response object, whereas Scrapy expects a URL there. If you want to make another request to the same URL, you can use return scrapy.Request(response.url, cookies=[{...}], dont_filter=True).
I highly doubt this will work, though. A FormRequest is usually the way to go when you want to log in.
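
For reference, here is a minimal login sketch using FormRequest.from_response; the credential field names ('email', 'password') and the simplified sign-in URL are assumptions and should be checked against the actual form in the page source.

    import scrapy
    from scrapy.http import FormRequest

    class ImdbLoginSketchSpider(scrapy.Spider):
        name = 'imdb_login_sketch'
        allowed_domains = ['imdb.com']
        # Shortened sign-in URL; the full OpenID URL from the question also works here.
        start_urls = ['https://www.imdb.com/ap/signin']

        def parse(self, response):
            # from_response copies the hidden inputs (appActionToken, metadata1, ...)
            # from the form automatically, so only the credentials need to be supplied.
            yield FormRequest.from_response(
                response,
                formdata={'email': 'you@example.com', 'password': 'your-password'},  # assumed field names
                callback=self.after_login,
            )

        def after_login(self, response):
            if b'sign in' in response.body.lower():
                self.logger.error('Login appears to have failed')
                return
            yield scrapy.Request('https://www.imdb.com/lists/tt0120852', callback=self.listParse)

        def listParse(self, response):
            # Continue with the list parsing from the original spider here.
            pass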

How to get scrapy to continue iterating through websites even with errors?

So I'm relatively new to Scrapy and am trying to build a crawler that pulls the hyperlinks of businesses on a listing page. Here is the code:
from scrapy import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

class EmailSpider(CrawlSpider):
    name = "emailcrawler"
    start_urls = [
        'https://www.yellowpages.com/search?search_terms=Computer+Software+%26+Services&geo_location_terms=Florence%2C+KY'
        # 'https://www.yellowpages.com/search?search_terms=Computers+%26+Computer+Equipment-Service+%26+fix&geo_location_terms=FL'
    ]

    def parse(self, response):
        information = response.xpath('//*[@class="info"]')
        for info in information:
            website = info.xpath('.//*[@class="links"]/a/@href').extract_first()
            if website != "None":
                request = Request(url=website, callback=self.parse_email, errback=self.handle_error,
                                  meta={'dont_retry': True, 'dont_redirect': True, 'handle_httpstatus_list': [302]})
                request.meta['data'] = {
                    'Website': website
                }
                # yield response.follow(url=website, callback=self.parse_email)
                yield request

        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url, errback=self.handle_error, meta={'dont_retry': True, 'dont_redirect': True})

    def parse_email(self, response):
        data = response.meta.get('data')
        # try:
        #     emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.com", response.text, re.I))
        # except AttributeError:
        #     return
        # data['email'] = emails
        selector = Selector(response)
        for found_address in selector.re(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.com'):
            # item = EmailAddressItem()
            data['email_address'] = found_address
            # item['url'] = response.url
            yield data

    def handle_error(self, failure):
        self.log("Request failed: %s" % failure.request)
Before I attempted to get Scrapy to follow each link, I had it simply return the list of websites it pulled, which worked perfectly: it requested the next page after iterating through the URLs on the current page and then yielded the results.
What I am trying to do now is have it go to each website it pulls, extract an email address from that website if one is found, and then return to the loop and try the next website. The problem is that when the crawler gets a response error, the crawl just stops. It also seems that even when the Request succeeds, the crawler never returns to the original iteration through the Yellow Pages URLs; it gets stuck in one of the websites it follows and the for loop dies.
How can I get the crawler to stay on course and keep attempting to pull data from the websites it scrapes while still iterating through each page of the listing site? Put simply, I need it to go through every page of the initial listing, no matter what request errors come up, while popping in and out of the websites it finds and attempting to scrape data on those sites.
from scrapy import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from ..items import EmailAddressItem  # assumes EmailAddressItem is defined in items.py

class EmailSpider(CrawlSpider):
    name = "followwebsite"
    start_urls = [
        # 'https://www.manta.com/mb_35_D000B000_000/offices_and_clinics_of_medical_doctors',
        # 'https://www.chess.com/home'
        # 'https://webscraper.io/test-sites/e-commerce/static'
        'https://www.yellowpages.com/search?search_terms=Computer+Software+%26+Services&geo_location_terms=Florence%2C+KY',
        'https://www.yellowpages.com/search?search_terms=Computers+%26+Computer+Equipment-Service+%26+fix&geo_location_terms=FL'
    ]

    def parse(self, response):
        website = response.xpath('//*[@class="links"]/a/@href')
        yield from response.follow_all(website, self.parse_email)

        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        absolute_next_page_url = response.urljoin(next_page_url)
        yield Request(absolute_next_page_url, errback=self.handle_error)

    def parse_email(self, response):
        selector = Selector(response)
        for found_address in selector.re(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.com'):
            item = EmailAddressItem()
            item['email_address'] = found_address
            # item['url'] = response.url
            yield item

    def handle_error(self, failure):
        self.log("Request failed: %s" % failure.request)
Figured it out no thanks to you bums
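
One small, hedged addition to the version above: in Scrapy 2.0+, response.follow_all forwards keyword arguments to Request, so errback can also be attached to the per-business requests, which keeps a failing external site from going unnoticed.

    def parse(self, response):
        websites = response.xpath('//*[@class="links"]/a/@href')
        # errback is attached to every generated request, so a dead business
        # site only triggers handle_error instead of silently disappearing.
        yield from response.follow_all(websites, callback=self.parse_email,
                                       errback=self.handle_error)

        next_page_url = response.xpath('//*[@class="next ajax-page"]/@href').extract_first()
        if next_page_url:
            yield Request(response.urljoin(next_page_url), errback=self.handle_error)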

how to scrape Tripadvisor dynamically using scrapy and python

I am trying to scrape TripAdvisor reviews, but I cannot find the XPath that lets the spider move dynamically through all the pages. I tried yield and callback, but I cannot find the XPath for the link that goes to the next page. I am talking about this site.
Here is my code (updated):
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapingtest.items import ScrapingTestingItem

class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"
    start_urls = [
        "http://www.tripadvisor.in/Hotel_Review-g297679-d300955-Reviews-Ooty_Fern_Hill_A_Sterling_Holidays_Resort-Ooty_Tamil_Nadu.html"]
    output_json_dict = {}

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        items = []
        i = 0
        for sites in sites:
            item = ScrapingTestingItem()
            # item['reviews'] = sel.xpath('//p[@class="partial_entry"]/text()').extract()
            item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
            item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
            item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
            items.append(item)
            i += 1
        sites = sel.xpath('//a[contains(text(), "Next")]/@href').extract()
        if(sites and len(sites) > 0):
            yield Request(url="tripadvisor.in" + sites[i], callback=self.parse)
        else:
            yield items
If you want to select the URL behind Next, why don't you try something like this:
next_url = response.xpath('//a[contains(text(), "Next")]/@href').extract()
And then yield a Request with this URL? This way you always get the next page to scrape and don't need the line containing the numbers.
Recently I did something similar on TripAdvisor and this approach worked for me. If it doesn't work for you, update your code with the approach you are trying so we can see where it can be improved.
Update
And change your Request creation block to the following:
if(sites and len(sites) > 0):
    for site in sites:
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)
Remove the else part and yield the items at the end of the loop, once the method has finished all the parsing (see the sketch below).
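
Putting both suggestions together, the parse method could look roughly like this (a sketch based on the fields from the question, not tested against the live site):

    def parse(self, response):
        sel = Selector(response)

        # Yield one item per page, built from the fields used in the question.
        item = ScrapingTestingItem()
        item['subjects'] = sel.xpath('//span[@class="noQuotes"]/text()').extract()
        item['stars'] = sel.xpath('//*[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
        item['names'] = sel.xpath('//*[@class="username mo"]/span/text()').extract()
        yield item

        # Follow every "Next" link found on the page.
        for site in sel.xpath('//a[contains(text(), "Next")]/@href').extract():
            yield Request(url="http://www.tripadvisor.in" + site, callback=self.parse)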
I think it can only work if you put the list of URLs you want to scrape in a .txt file.
class scrapingtestspider(Spider):
    name = "scrapytesting"
    allowed_domains = ["tripadvisor.in"]
    base_uri = "tripadvisor.in"

    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
