Webcrawler multiple page iteration - python

I want to make the crawler go to the next page to extract data; any help on what to do would be appreciated. I am a little lost. I tried Scrapy, but it is kind of complicated, and bs4 is more convenient.
import bs4 as bs
import urllib.request
import pandas as pd
import re

source = urllib.request.urlopen('https://messageboards.webmd.com/').read()
soup = bs.BeautifulSoup(source,'lxml')
df = pd.DataFrame(columns = ['link'],data=[url.a.get('href') for url in soup.find_all('div',class_="link")])
lists=[]

for i in range(0,33):
    link = (df.link.iloc[i])
    source1 = urllib.request.urlopen(link).read()
    soup1 = bs.BeautifulSoup(source1,'lxml')
    for url1 in soup1.find_all('a',class_="next"):
        next_link = soup1.find('a',href = True, text = re.compile("next"))
        if next_link:
            lists.append(link+url1.get('href'))

So it looks like you're storing hrefs in a list
for url1 in soup1.find_all('a',class_="next"):
    next_link = soup1.find('a',href = True, text = re.compile("next"))
    if next_link:
        lists.append(link+url1.get('href'))
Now you actually have to do something with them. In this case I'm assuming you want to navigate to each href in your list.
for href in lists:
    new_page = urllib.request.urlopen(href).read()
And then you can scrape whatever data you want out of new_page
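For example, here is a minimal sketch of that idea, assuming each board uses the same a.next pagination pattern; the a.thread-title selector is only a placeholder, and lists is the list of hrefs built above:

import urllib.request
from urllib.parse import urljoin
import bs4 as bs

def crawl_board(start_url):
    # Yield a BeautifulSoup object for start_url and for every 'next' page after it.
    url = start_url
    while url:
        source = urllib.request.urlopen(url).read()
        soup = bs.BeautifulSoup(source, 'lxml')
        yield soup
        # stop when the page has no pagination link
        next_link = soup.find('a', class_='next', href=True)
        url = urljoin(url, next_link['href']) if next_link else None

for href in lists:                         # lists is the list built above
    for page_soup in crawl_board(href):
        # placeholder selector -- inspect the real markup before relying on it
        for title in page_soup.find_all('a', class_='thread-title'):
            print(title.get_text(strip=True))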

I've got the same problem. Here is my code example from a page I crawled as an exercise. I've chained multiple site requests to get detailed information.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from capterra.items import CapterraItem

class CapterraCatSpider(CrawlSpider):
    name = 'capterra_cat'
    #allowed_domains = ['http://www.capterra.com/categories']
    start_urls = ['http://www.capterra.com/categories']
    # rules = (
    #     Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    # )

    def parse(self, response):
        #TEMP
        for category in response.css('ol.browse-group-list'):
            #Debug: only elements of one category
            if category.css('a::text').extract_first() == 'Yoga Studio':
                i = CapterraItem()
                #Get link to detail page
                i['cat_name'] = category.css('a::text').extract_first()
                #join link to detail page with base url
                i['cat_link'] = response.urljoin(category.css('a::attr(href)').extract_first())
                cat_link = i['cat_link']
                print(cat_link)
                #call request to detail page and pass response to parse_details method with callback method
                request = scrapy.Request(cat_link, callback=self.parse_details)
                request.meta['item'] = i
                yield request

    def parse_details(self, response):
        #Debug print
        print('DETAILS!')
        #read your items from response meta
        item = response.meta['item']
        #iterate over listings
        for detail in response.css('p.listing-description.milli'):
            item['profile_link'] = response.urljoin(detail.css('a.spotlight-link::attr(href)').extract_first())
            #call request to profile page to get more information for listing
            request = scrapy.Request(item['profile_link'], callback=self.parse_profile)
            #set your item to request metadata
            request.meta['item'] = item
            yield request

    def parse_profile(self, response):
        #Debug print
        print('PROFILE')
        item = response.meta['item']
        item['product_name'] = response.css('h1.beta.no-margin-bottom::text').extract_first()
        item['who_uses_software'] = response.css('div.spotlight-target > p.epsilon > i::text').extract_first()
        item['vendor_name'] = response.css('h2.spotlight-vendor-name > span::text').extract_first()
        return item

Related

Navigate to new page in Scrapy with the same URL

I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using Scrapy and not Selenium (which is not efficient enough for this purpose), that I can navigate through the different pages? Please see my code below and the markup of the relevant 'next page' button in the linked image.
Thanks.
class listingsSpider(scrapy.Spider):
    name = 'listings'
    start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']

    def parse(self, response):
        self.logger.info('This my first spider')
        address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        listings = response.xpath('//h2[@class="propertyCard-title"]')
        for listing in listings:
            yield {
                'Listing': listing.get()
            }
        nextPage = response.xpath('//*[@id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
        nextPage = nextPage.get()
        pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
        pageTest = pageTest.get()
        if pageTest is not None:
            pageTest = response.urljoin(pageTest)
            yield scrapy.Request(pageTest, callback=self.parse)
[Image: markup of the 'next page' button: https://i.stack.imgur.com/1I1J1.png]
Actually, it turns out that each page has a unique identifier in the URL: appending &index=24, for example, sends you to the next page.
What you need to figure out is how to include that in the request URL. A search may span several pages, so we increment the index by 24 each time to move on to the next one. Since we could keep incrementing forever, we use the reported number of results as the condition to stop. It's rather sneaky to spot at first sight, but pretty easy to overcome.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup

links = []
for i in range(0, 480, 24):
    url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'lxml')
    ps1 = soup.find_all('span', {'class': 'searchHeader-resultCount'})
    for ps in ps1:
        if int(ps.text.strip()) > i:
            links.append(url)
        else:
            break

class ListingsItem(scrapy.Item):
    address = Field(output_processor=TakeFirst())
    listings = Field(output_processor=TakeFirst())

class listingsSpider(scrapy.Spider):
    name = 'listings'
    start_urls = links

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        container = response.xpath('//div[@class="l-searchResults"]/div')
        for sales in container:
            l = ItemLoader(ListingsItem(), selector=sales)
            l.add_xpath('address', '//address[@class="propertyCard-address"]/meta[@content]')
            l.add_xpath('listings', '//h2[@class="propertyCard-title"]//text()[normalize-space()]')
            yield l.load_item()
        #self.logger.info('This my first spider')
        #address = response.xpath('//*[@id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
        #listings = response.xpath('//h2[@class="propertyCard-title"]')
        #for listing in listings:
        #    yield{
        #        'Listing': listing.get()
        #    }

process = CrawlerProcess(
    settings={
        'FEED_URI': 'rightmove.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)

process.crawl(listingsSpider)
process.start()
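One small caveat: on newer Scrapy releases (2.1 and later), FEED_URI and FEED_FORMAT are deprecated in favour of the FEEDS setting, so if you are on a recent version the equivalent would be roughly:

process = CrawlerProcess(
    settings={
        # map each output file to its export options
        'FEEDS': {'rightmove.jl': {'format': 'jsonlines'}},
    }
)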

Scrapy only going through first 5 links with next_page_url

My code seemingly only goes through the first 5 links that are requested and then stops when the 6th is requested. I have tried to use start_urls and next_page_url. Both only extract from the first 5 pages given.
import scrapy
from scrapy.crawler import CrawlerProcess
import time

class finvizSpider(scrapy.Spider):
    global tickers
    global urlcheck
    urlcheck = 1
    tickers = []
    name = "finviz"
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(tickers)
        next_page_url = "https://finviz.com/"
        html = response.xpath(
            '//a[@class="screener_arrow"]/@href').extract()[0]
        print(html)
        next_page_url += html
        print(next_page_url)
        if next_page_url is not None:
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        newTickerList = []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
Here is the error statement:
Any help is appreciated.
EDIT:
I have updated the code, but still seem to get errors.
import scrapy
from scrapy.crawler import CrawlerProcess
import time
from bs4 import BeautifulSoup

class finvizSpider(scrapy.Spider):
    global tickers
    global urlcheck
    urlcheck = 1
    tickers = []
    name = "finviz"
    start_urls = [
        "https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=-change"]

    def parse(self, url):
        raw_html = scrapy.Request(url)
        good_html = BeautifulSoup(raw_html, 'html.parser')
        first_part = "https://finviz.com/"
        tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
        second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
        # Check if there is next page
        if second_part:
            next_url = first_part + second_part
            self.parse(next_url)

    def returnTickers(self):
        newTickerList = []
        for lists in tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList

stock_list = finvizSpider()

process = CrawlerProcess()
process.crawl(finvizSpider)
process.start()

list2 = stock_list.returnTickers()
I get the following error when this is run.
The check if next_page_url is not None: can never be False, because next_page_url is always a string; what you need to check is whether html is None.
The line next_page_url += html will raise an error when html is None, so test html first.
And since extract() returns an empty list when nothing matches, extract()[0] will fail on the last page; replace it with extract_first() (I used get()).
Here is the fixed code:
import scrapy
from scrapy.crawler import CrawlerProcess
import time

class FinvizSpider(scrapy.Spider):
    name = "finviz"
    urlcheck = 1
    tickers = []
    start_urls = ["https://finviz.com/screener.ashx?v=111&f=cap_small,geo_usa,sh_avgvol_o300,sh_opt_option,sh_short_low&ft=4&o=change"]

    def parse(self, response):
        self.tickers.append(response.xpath('//a[@class="screener-link-primary"]/text()').extract())
        print(self.tickers)
        next_page_url = "https://finviz.com/"
        html = response.xpath('//a[@class="screener_arrow"]/@href').get()
        print(html)
        if html is not None:
            next_page_url += html
            print(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def returnTickers(self):
        newTickerList = []
        for lists in self.tickers:
            if lists:
                for t in lists:
                    newTickerList.append(t)
        return newTickerList
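As a side note on collecting the results afterwards: a separately created finvizSpider() instance (as in the edited question) never takes part in the crawl, because Scrapy builds its own spider instance inside the CrawlerProcess. Since tickers is a class attribute, one possible sketch is to read it off the class once the crawl has finished:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(FinvizSpider)   # pass the class; Scrapy creates its own instance
process.start()               # blocks until the crawl has finished

# tickers is a class attribute, so the data appended during the crawl is visible here
all_tickers = [t for page in FinvizSpider.tickers for t in page]
print(all_tickers)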
It looks like Scrapy only follows the callback 5 times here, so instead of relying on callbacks I would recommend iterating over a list of all the links; you can do that with BeautifulSoup and it is very simple.
Install
pip install BeautifulSoup4
BS4 import:
from bs4 import BeautifulSoup
Rest of code:
def parse(self, url):
    raw_html = scrapy.Request(url)
    good_html = BeautifulSoup(raw_html, 'html.parser')
    first_part = "https://finviz.com/"
    tickers.append([x.text for x in good_html.findAll('a', {'class': 'screener-link-primary'})])
    second_part = good_html.find('a', {'class': 'screener_arrow'})['href']
    # Check if there is next page
    if second_part:
        next_url = first_part + second_part
        self.parse(next_url)
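Note that scrapy.Request(url) on its own only builds a request object; it never downloads anything, so raw_html above will not contain HTML. A minimal sketch of the same loop using plain requests instead (outside Scrapy entirely, assuming the 'next' arrow is absent on the last page; finviz may also require a browser-like User-Agent header):

import requests
from bs4 import BeautifulSoup

def collect_tickers(start_url):
    # Walk the screener pages via the 'next' arrow and collect every ticker.
    tickers = []
    url = start_url
    while url:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        tickers += [a.text for a in soup.find_all('a', {'class': 'screener-link-primary'})]
        arrow = soup.find('a', {'class': 'screener_arrow'})
        # assumption: the arrow link disappears on the last page
        url = 'https://finviz.com/' + arrow['href'] if arrow else None
    return tickers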

Splash for Scrapy only returns empty list

I hope there's someone who can help a newbie:
I am trying to scrape the prices of https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html using Scrapy. Since those prices are loaded dynamically with JavaScript, I tried to use Splash to deal with the problem. But the outcome is still the same: empty lists for the prices ("hotel_displayed_prices"). All the other items receive the correct values.
On the webpage I found two ways to get to the price with CSS selector:
.price-wrap .price :: text
.premium-offer-container div::attr(data-locationid)
Neither of them seems to work... or they both do and it is just Splash that does not.
For Scrapy I copied all the configuration from https://github.com/scrapy-plugins/scrapy-splash into my settings file. I also set ROBOTSTXT_OBEY = False.
When rendering the website in Splash 3.4.1 (browser window) it showed me the hotel prices, so in principle it should work, I guess.
import scrapy
from ..items import TestItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    name = "Testdata"
    start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 5})

    def parse(self, response):
        items = TestItem()
        all_single_entries = response.css("div.listItem")
        for entry in all_single_entries:
            hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
            hotel_links = entry.css(".listing_title a").xpath("@href").extract()
            hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid)").extract()
            items["hotel_names"] = str(hotel_names).split("'")[1]
            items["hotel_links"] = "https://www.tripadvisor.com" + str(hotel_links).split("'")[1]
            items["hotel_ids"] = int(str(hotel_ids).split("_")[1].split("'")[0])
            items["hotel_displayed_price"] = hotel_displayed_price
            yield items
On this line
hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-locationid").extract()
Are you missing a closing bracket on "div::attr(data-locationid" ?
I've had a look at the behaviour under scrapy, and the prices are not returned in the HTML to a request from scrapy. What you're seeing in the browser (even Splash) is not the same as what your code is seeing.
I don't know scrapy well enough to work through this, but it seems possible to get what you need with plain old requests & BeautifulSoup:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.tripadvisor.ie/Hotels-g189541-Copenhagen_Zealand-Hotels.html')
soup = BeautifulSoup(r.content, 'lxml')

prices = [price.text for price in soup.select('.price-wrap .price')]
print(prices)
['€131', '€112', '€121', '€133', '€172', '€169', '€74', '€189', ...]
For everyone with a similar problem: here is my solution. However, I do have problems with duplicates when I run the script.
import scrapy
from ..items import HotelinfoItem
from scrapy_splash import SplashRequest

class HoteldataSpider(scrapy.Spider):
    name = "Hoteldata"
    start_urls = ["http://localhost:8050/render.html?url=https:"
                  "//www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})

    def parse(self, response):
        items = HotelinfoItem()
        all_single_entries = response.css("div.listItem")
        for entry in all_single_entries:
            hotel_names = entry.css(".listing_title [target=_blank]::text").extract()
            hotel_links = entry.css(".listing_title a").xpath("@href").extract()
            hotel_ids = entry.css(".listing_title").css("a::attr(id)").extract()
            hotel_displayed_price = entry.css(".premium_offer_container").css("div::attr(data-pernight)").extract()
            hotel_type = entry.css(".mb10").css(".label::text").extract()
            items["hotel_names"] = [str(hotel_names).split("'")[1]]
            items["hotel_links"] = ["https://www.tripadvisor.com" + str(hotel_links).split("'")[1]]
            items["hotel_ids"] = [str(hotel_ids).split("_")[1].split("'")[0]]
            if len(hotel_type) == 0:
                items["hotel_type"] = ["Hotel"]
            else:
                items["hotel_type"] = hotel_type
            if len(hotel_displayed_price) == 0:
                items["hotel_displayed_price"] = ["NA"]
            else:
                items["hotel_displayed_price"] = hotel_displayed_price
            yield items
        next_page = response.css("a.next::attr(href)").get()
        next_page_splash = "http://localhost:8050/render.html?url=https://www.tripadvisor.com" + \
                           str(next_page).split("#")[0] + "&timeout=10&wait=5"
        if next_page is not None:
            yield response.follow(next_page_splash, callback=self.parse)
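On the duplicates mentioned above: one simple approach (only a sketch, assuming the same markup and scrapy-splash setup as in the question; selectors may need adjusting) is to remember the hotel ids that were already yielded and skip repeats:

import scrapy
from scrapy_splash import SplashRequest

class DedupedHotelSpider(scrapy.Spider):
    # Same crawling flow as above, but skips hotel ids that were already seen.
    name = "Hoteldata_dedup"
    start_urls = ["https://www.tripadvisor.com/Hotels-g189541-Copenhagen_Zealand-Hotels.html"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seen_ids = set()

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 10})

    def parse(self, response):
        for entry in response.css("div.listItem"):
            hotel_id = entry.css(".listing_title a::attr(id)").get()
            if hotel_id in self.seen_ids:
                continue                    # duplicate from an earlier page
            self.seen_ids.add(hotel_id)
            yield {
                "hotel_id": hotel_id,
                "hotel_name": entry.css(".listing_title a::text").get(),
            }
        next_page = response.css("a.next::attr(href)").get()
        if next_page is not None:
            yield SplashRequest(response.urljoin(next_page), callback=self.parse, args={"wait": 10})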

Web Scraping all Urls from a website with Scrapy and Python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[@class="work_area_content"]/a/@href'))
from a website and return the title and URL of all the leaves, sectioned by each leaf's parent. I have two scrapers: one in plain Python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or a single-dimension list (I believe multidimensional, but that adds complexity)? Which of the code below is better? If the Scrapy code is better, how do I migrate the plain Python code to the Scrapy code?
From what I understand, a callback passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and therefore lost, and you're unable to get back to the root data. Is this correct?
python:
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[@class="work_area_content"]/a/@href').extract()

i = 0
j = 0
k = 0

for i, url in enumerate(urls):
    absolute_url = "".join(['http://1.1.1.1:1234/', url])
    url_storage[i][j][k].append(absolute_url)
    print(url_storage)
    #url_storage.insert(i, absolute_url)
    page = requests.get(url_storage[i][j][k])
    tree2 = html.fromstring(page.content)
    urls2 = tree2.xpath('//div[@class="work_area_content"]/a/@href').extract()
    for j, url2 in enumerate(urls2):
        absolute_url = "".join(['http://1.1.1.1:1234/', url2])
        url_storage[i][j][k].append(absolute_url)
        page = requests.get(url_storage[i][j][k])
        tree3 = html.fromstring(page.content)
        urls3 = tree3.xpath('//div[@class="work_area_content"]/a/@href').extract()
        for k, url3 in enumerate(urls3):
            absolute_url = "".join(['http://1.1.1.1:1234/', url3])
            url_storage[i][j][k].append(absolute_url)
            page = requests.get(url_storage[i][j][k])
            tree4 = html.fromstring(page.content)
            urls3 = tree4.xpath('//div[@class="work_area_content"]/a/@href').extract()
            title = tree4.xpath('//span[@class="page_title"]/text()').extract()
            yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3})
            #yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3})

def end_page_parse_TS(self, response):
    print(response.body)
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    yield {'URL': url, 'Title': title}

def end_page_parse_TC(self, response):
    url = response.meta.get('URL')
    title = response.meta.get('Title')
    description = response.meta.get('Description')
    description = response.xpath('//table[@class="wiki_table]/tbody[contains(/td/text(), "description")/parent').extract()
    yield {'URL': url, 'Title': title, 'Description': description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem

class DatabloggerSpider(CrawlSpider):
    # The name of the spider
    name = "datablogger"

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = ['http://1.1.1.1:1234/']

    # The URLs to start with
    start_urls = ['http://1.1.1.1:1234/TestSuites']

    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(
                canonicalize=True,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]

    # Method which starts the requests by visiting all URLs specified in start_urls
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)

    # Method for parsing items
    def parse_items(self, response):
        # The list of items that are found on the particular page
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
        # Now go through all the found links
        item = DatabloggerScraperItem()
        item['url_from'] = response.url
        for link in links:
            item['url_to'] = link.url
            items.append(item)
        # Return all the found items
        return items
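On the callback question above: a callback is simply the method Scrapy invokes with the downloaded response for that particular request, and every request carries its own meta dict, so data passed this way is not overwritten when the same callback handles many URLs. A minimal sketch, reusing the question's placeholder host and selectors:

import scrapy

class MetaDemoSpider(scrapy.Spider):
    # Each Request carries its own meta dict, so nothing is lost when
    # the same callback runs for many different requests.
    name = "meta_demo"
    start_urls = ["http://1.1.1.1:1234/TestSuites"]   # placeholder host from the question

    def parse(self, response):
        for href in response.xpath('//div[@class="work_area_content"]/a/@href').getall():
            absolute_url = response.urljoin(href)
            # parse_leaf is the callback: Scrapy calls it with the response
            # for THIS request, together with the meta attached here
            yield scrapy.Request(
                absolute_url,
                callback=self.parse_leaf,
                meta={"parent_url": response.url},
            )

    def parse_leaf(self, response):
        title = response.xpath('//span[@class="page_title"]/text()').get()
        yield {
            "Parent": response.meta["parent_url"],
            "URL": response.url,
            "Title": title,
        }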

Scrapy pagination issues - new to this stuff

I am trying to make a scrapy bot that utilizes pagination but I'm having no success...
The bot crawls through all of the links on the first page but never goes on to the next page. I have read a ton of different threads and I can't figure this out at all. I am very new to web scraping, so please feel free to hammer the crap out of my code.
import time
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http.request import Request
from tutorial.items import TutorialItem
#from scrapy_tutorial.items import ScrapyTutorialItem

class raytheonJobsPageSpider(CrawlSpider):
    name = "raytheonJobsStart"
    allowed_domains = ["jobs.raytheon.com"]
    start_urls = [
        "https://jobs.raytheon.com/search-jobs"
    ]
    rules = ( Rule(LinkExtractor(restrict_xpaths=('//div[@class="next"]',)), callback='parse_listings', follow=True), )

    def parse_start_url(self, response):
        '''
        Crawl start URLs
        '''
        return self.parse_listings(response)

    def parse_listings(self, response):
        '''
        Extract data from listing pages
        '''
        sel = Selector(response)
        jobs = response.xpath(
            '//*[@id="search-results-list"]/ul/*/a/@href'
        ).extract()
        nextLink = response.xpath('//a[@class="next"]').extract()
        print "This is just the next page link - ", nextLink
        for job_url in jobs:
            job_url = self.__normalise(job_url)
            job_url = self.__to_absolute_url(response.url, job_url)
            yield Request(job_url, callback=self.parse_details)

    def parse_details(self, response):
        '''
        Extract data from details pages
        '''
        sel = Selector(response)
        job = sel.xpath('//*[@id="content"]')
        item = TutorialItem()
        # Populate job fields
        item['title'] = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        jobTitle = job.xpath('//*[@id="content"]/section[1]/div/h1/text()').extract()
        item['reqid'] = job.xpath('//*[@id="content"]/section[1]/div/span[1]/text()').extract()
        item['location'] = job.xpath('//*[@id="content"]/section[1]/div/span[last()]/text()').extract()
        item['applink'] = job.xpath('//*[@id="content"]/section[1]/div/a[2]/@href').extract()
        item['description'] = job.xpath('//*[@id="content"]/section[1]/div/div').extract()
        item['clearance'] = job.xpath('//*[@id="content"]/section[1]/div/*/text()').extract()
        #item['page_url'] = response.url
        item = self.__normalise_item(item, response.url)
        time.sleep(1)
        return item

    def __normalise_item(self, item, base_url):
        '''
        Standardise and format item fields
        '''
        # Loop item fields to sanitise data and standardise data types
        for key, value in vars(item).values()[0].iteritems():
            item[key] = self.__normalise(item[key])
        # Convert job URL from relative to absolute URL
        #item['job_url'] = self.__to_absolute_url(base_url, item['job_url'])
        return item

    def __normalise(self, value):
        print self, value
        # Convert list to string
        value = value if type(value) is not list else ' '.join(value)
        # Trim leading and trailing special characters (Whitespaces, newlines, spaces, tabs, carriage returns)
        value = value.strip()
        return value

    def __to_absolute_url(self, base_url, link):
        '''
        Convert relative URL to absolute URL
        '''
        import urlparse
        link = urlparse.urljoin(base_url, link)
        return link

    def __to_int(self, value):
        '''
        Convert value to integer type
        '''
        try:
            value = int(value)
        except ValueError:
            value = 0
        return value

    def __to_float(self, value):
        '''
        Convert value to float type
        '''
        try:
            value = float(value)
        except ValueError:
            value = 0.0
        return value
You don't need PhantomJS or Splash.
By inspecting the AJAX calls I found that the jobs are loaded via AJAX calls to the URL stored in ajaxURL in the code below.
You can see the CurrentPage parameter at the end of that URL.
The result is returned in JSON format, and all the jobs are under the key named results.
I created a project on my side with fully working code for you. Here is the link on GitHub; just download and run it ... you don't have to do anything at all :P
Download the whole working project from here: https://github.com/mani619cash/raytheon_pagination
The basic logic is here:
import json
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.exceptions import CloseSpider

class RaytheonspiderSpider(CrawlSpider):
    name = "raytheonJobsStart"
    page = 180
    ajaxURL = "https://jobs.raytheon.com/search-jobs/results?ActiveFacetID=0&RecordsPerPage=15&Distance=50&RadiusUnitType=0&Keywords=&Location=&Latitude=&Longitude=&ShowRadius=False&CustomFacetName=&FacetTerm=&FacetType=0&SearchResultsModuleName=Search+Results&SearchFiltersModuleName=Search+Filters&SortCriteria=5&SortDirection=1&SearchType=5&CategoryFacetTerm=&CategoryFacetType=&LocationFacetTerm=&LocationFacetType=&KeywordType=&LocationType=&LocationPath=&OrganizationIds=&CurrentPage="

    def start_requests(self):
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)

    def parse_listings(self, response):
        resp = json.loads(response.body)
        response = Selector(text=resp['results'])
        jobs = response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract()
        if jobs:
            for job_url in jobs:
                job_url = "https://jobs.raytheon.com" + self.__normalise(job_url)
                #job_url = self.__to_absolute_url(response.url, job_url)
                yield Request(url=job_url, callback=self.parse_details)
        else:
            raise CloseSpider("No more pages... exiting...")
        # go to next page...
        self.page = self.page + 1
        yield Request(self.ajaxURL + str(self.page), callback=self.parse_listings)
Change
restrict_xpaths=('//div[@class="next"]',)
to
restrict_xpaths=('//a[@class="next"]',)
If that does not work, then do a recursive call to the parse_listings function:
def parse_listings(self, response):
    '''
    Extract data from listing pages
    '''
    sel = Selector(response)
    jobs = response.xpath(
        '//*[@id="search-results-list"]/ul/*/a/@href'
    ).extract()
    nextLink = response.xpath('//a[@class="next"]').extract()
    print "This is just the next page link - ", nextLink
    for job_url in jobs:
        job_url = self.__normalise(job_url)
        job_url = self.__to_absolute_url(response.url, job_url)
        yield Request(job_url, callback=self.parse_details)

    yield Request(pagination link here, callback=self.parse_listings)
I am on mobile so I can't type code. I hope the logic I described makes sense.
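For reference, here is a minimal sketch of how that pagination call could be written, assuming the next-page link is the a.next anchor the spider already inspects (its href may be relative, hence urljoin):

from scrapy.http import Request

def parse_listings(self, response):
    # Drop-in body for the spider above: scrape one listing page, then queue the next one.
    for href in response.xpath('//*[@id="search-results-list"]/ul/*/a/@href').extract():
        yield Request(response.urljoin(href), callback=self.parse_details)

    next_page = response.xpath('//a[@class="next"]/@href').extract_first()
    if next_page:
        yield Request(response.urljoin(next_page), callback=self.parse_listings)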
