cannot extract javascript using scrapy from webpage - python

I am working on extracting data from indeed.com for a data science project I am working on. Though I am able to successfully scrape various portions of the page, I am having some issues scraping items from within the JSON portion of the page.
Does anyone know how I would extract the items below from the URL ? >>> view-source:https://www.indeed.com/viewjob?jk=41abec7fde3513dc&tk=1dn0mslbr352v000&from=serp&vjs=3&advn=9434814581076032&adid=197003786&sjdu=BbcXv7z69Xez4bal0Fx7iYB6jxzlBG3p6CfmfgjyGDErM4mqXgOsfEsOF5maJ2GRnKJsHskFl8aEbb4LlD5LibXOuIs0dzzHfVCmKB00C2c43rDVhEZX_8Zmg4zqEyqG5LEfQjRfoyOhULxXHTMitWOUjMOdLRt367-ZewSzfkqUSnPzHungl7uY7NcfOFLy .
Items to be extracted below: \nPOT-Creation-Date: \nPO-Revision-Date: "jobLocation":"Arlington, TX
A sample script I am running is below
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
import boto3
class JobsSpider1(scrapy.Spider):
name = "indeed"
allowed_domains = ["indeed.com"]
start_urls = ["https://www.indeed.com/jobs?q=\"owner+operator\"+\"truck\"&l=augusta"]
custom_settings = {
'FEED_FORMAT': 'json',
'FEED_URI':'me_test.json'
}
def parse(self, response):
jobs = response.xpath('//div[#class="title"]')
for job in jobs:
title = job.xpath('a//#title').extract_first()
posting_link = job.xpath('a//#href').extract_first()
posting_url = "https://indeed.com" + posting_link
yield Request(posting_url, callback=self.parse_page, meta={'title': title, 'posting_url':posting_url})
relative_next_url = response.xpath('//link[#rel="next"]/#href').extract_first()
absolute_next_url = "https://indeed.com" + relative_next_url
yield Request(absolute_next_url, callback=self.parse)
def parse_page(self, response):
posting_url = response.meta.get('posting_url')
job_title = response.meta.get('title')
#job_name= response.xpath('//*[#class="icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title"]/text()').extract_first()
job_descriptions=response.xpath('//*[#class="jobsearch-jobDescriptionText"]/ul').extract_first()
job_listing_header=response.xpath('//*[#class="jobSectionHeader"]/ul').extract_first()
posted_on_date= response.xpath('//*[#class="jobsearch-JobMetadataFooter"]/text()').extract_first()
job_location=response.xpath('//*[#class="jobsearch-InlineCompanyRating icl-u-xs-mt--xs jobsearch-DesktopStickyContainer-companyrating"]/div[3]/text()').extract_first()
yield {
'job_title':job_title,
'posting_url':posting_url,
# 'job_name':job_name,
'job_listing_header':job_listing_header,
'job_location': job_location,
'job_descriptions':job_descriptions,
'posted_on_date':posted_on_date
}

Related

How can i assign the returned Scrapy data to a Variable?

import scrapy
from scrapy.crawler import CrawlerRunner
class Livescores2(scrapy.Spider):
name = 'Home'
def start_requests(self):
yield scrapy.Request('https://www.livescores.com/football/turkey/super-lig/?tz=3&table=league-home')
def parse(self, response):
for total in response.css('td'):
yield{
'total': total.css('::text').get()
}
runner2 = CrawlerRunner()
runner2.crawl(Livescores2)
When i adjust settings like below, i can save the data as json without a problem.
runner2 = CrawlerRunner(settings = {
"FEEDS": {
"Home.json": {"format": "json", "overwrite": True},
},
})
I want to assign the returned Scrapy data to a Variable so i can work on it.
I don't want any Json data!
I tried:
import scrapy
from scrapy.crawler import CrawlerRunner
class Livescores2(scrapy.Spider):
name = 'Home'
def start_requests(self):
yield scrapy.Request('https://www.livescores.com/football/turkey/super-lig/?tz=3&table=league-home')
def parse(self, response):
for total in response.css('td'):
yield{
'total': total.css('::text').get()
}
runner2 = CrawlerRunner()
a = runner2.crawl(Livescores2)
print(a)
Result is:
<Deferred at 0x65cbfb6d0>
How can i reach the data from a variable? I develop a Android app so i don't need any Json file.
I don't know how to use "return" function on this code.
Thanks very much
You can simply create a class attribute that stores the data, and then access it once the spider has completed processing all of the requests. This isn't really the workflow that the scrapy framework targets though, and there are likely other web-scraping tools that could handle this more intuitively.
for example:
import scrapy
from scrapy.crawler import CrawlerRunner
class Livescores2(scrapy.Spider):
name = 'Home'
data = [] # data attribute
def start_requests(self):
yield scrapy.Request('https://www.livescores.com/football/turkey/super-lig/?tz=3&table=league-home')
def parse(self, response):
for total in response.css('td'):
item = {'total': total.css('::text').get()}
self.data.append(item) # append item to data list
yield item
runner2 = CrawlerRunner()
a = runner2.crawl(Livescores2)
print(Livescores2.data) # print the collected data

Extracting next page and setting a break

I'm trying to extract webpage data and wished to take the next few pages also but up to a limit, which I can alter. However, I've tested to see if I can at least extract the next few web-pages using Scrapy (As I'm trying to figure this out in Scrapy to learn it), but It only returns the items within the first page.
How do I extract the next pages while setting a limit i.e. 5 pages
For example, here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
class StatisticsItem(scrapy.Item):
ebay_div = Field(output_processor=TakeFirst())
url = Field(output_processor=TakeFirst())
class StatisticsSpider(scrapy.Spider):
name = 'ebay'
start_urls = ['https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?rt=nc&LH_BIN=1' +
'&LH_PrefLoc=2&mag=1&_sop=16']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url
)
def parse(self, response):
all_cards = response.xpath('//div[#class="s-item__wrapper clearfix"]')
for card in all_cards:
name = card.xpath('.//h3/text()').get() #get name of product
price = card.xpath('.//span[#class="s-item__price"]//text()').get() #price
product_url = card.xpath('.//a[#class="s-item__link"]//#href').get() #link to product
# now do whatever you want, append to dictionary, yield as item...
summary_data = {
"Name": name,
"Price": price,
"URL": product_url
}
data = {'summary_data': summary_data}
yield scrapy.Request(product_url, meta=data, callback=self.parse_product_details)
# get the next page
next_page_url = card.xpath('.//a[#class="pagination__next icon-link"]/#href').extract_first()
# The last page do not have a valid url and ends with '#'
if next_page_url == None or str(next_page_url).endswith("#"):
self.log("eBay products collected successfully !!!")
else:
print('\n' + '-' * 30)
print('Next page: {}'.format(next_page_url))
yield scrapy.Request(next_page_url, callback=self.parse)
def parse_product_details(self, response):
# Get the summary data
data = response.meta['summary_data']
data['location'] = response.xpath('//span[#itemprop="availableAtOrFrom"]/text()').extract_first()
yield data
process = CrawlerProcess(
settings={
'FEED_URI': 'collectible_cards.json',
'FEED_FORMAT': 'jsonlines'
}
)
process.crawl(StatisticsSpider)
process.start()
You can try like this first make urls then start start_requests
start_urls = ["https://www.ebay.com/b/Collectible-Card-Games-Accessories/2536/bn_1852210?LH_BIN=1&LH_PrefLoc=2&mag=1&rt=nc&_pgn={}&_sop=16".format(i) for i in range(1,5)]

Navigate to new page in Scrapy with the same URL

I am writing a scrapy spider to scrape Rightmove, a property website. The issue I'm having is that the property search, which consists of several pages of different house listings, is all located under the same URL.
This means that the usual process of identifying the URL of the 'next' page doesn't work. Is there any way, using scrapy and not selenium (not efficient enough for the purpose) that I can navigate through the different pages? Please see my code and the source code of the relevant 'next page' button as the IMG below.
Thanks.
class listingsSpider(scrapy.Spider):
name = 'listings'
start_urls = ['https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&furnishTypes=&keywords=']
def parse(self, response):
self.logger.info('This my first spider')
address = response.xpath('//*[#id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
listings = response.xpath('//h2[#class="propertyCard-title"]')
for listing in listings:
yield{
'Listing': listing.get()
}
nextPage = response.xpath('//*[#id="l-container"]/div[3]/div/div/div/div[3]/button/div/svg/use')
nextPage = nextPage.get()
pageTest = response.css('div[class=pagination-button pagination-direction pagination-direction--next] svg a::attr(href)')
pageTest = pageTest.get()
if pageTest is not None:
pageTest = response.urljoin(pageTest)
yield scrapy.Request(pageTest,callback=self.parse)
```[![enter image description here][1]][1]
[1]: https://i.stack.imgur.com/1I1J1.png
Actually, it turns out that each page has a unique identifier in the web-link. For example, attach &index = 24, this sends you to the next page.
What you need to figure out is how to include that into the request url. Some may have several pages so we increment by +24 each time to go onto the next page. However, we could increment by +24 onto infinite, therefore we use the number of page results as a way to break. It's rather sneaky to notice at first sight! but pretty easy to overcome.
Here's a scraper that can go to these next pages as requested:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
import requests
from bs4 import BeautifulSoup
links= []
for i in range(0, 480, 24):
url = f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=STATION%5E1712&maxPrice=500000&radius=0.5&sortType=10&propertyTypes=&mustHave=&dontShow=&index={i}&furnishTypes=&keywords='
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
ps1 = soup.find_all('span', {'class':'searchHeader-resultCount'})
for ps in ps1:
if int(ps.text.strip()) > i:
links.append(url)
else:
break
class ListingsItem(scrapy.Item):
address = Field(output_processor = TakeFirst())
listings = Field(output_processor = TakeFirst())
class listingsSpider(scrapy.Spider):
name = 'listings'
start_urls = links
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse
)
def parse(self, response):
container = response.xpath('//div[#class="l-searchResults"]/div')
for sales in container:
l = ItemLoader(ListingsItem(), selector = sales)
l.add_xpath('address', '//address[#class="propertyCard-address"]/meta[#content]')
l.add_xpath('listings', '//h2[#class="propertyCard-title"]//text()[normalize-space()]')
yield l.load_item()
#self.logger.info('This my first spider')
#address = response.xpath('//*[#id="property-65695633"]/div/div/div[4]/div[1]/div[2]/a/address')
#listings = response.xpath('//h2[#class="propertyCard-title"]')
#for listing in listings:
# yield{
# 'Listing': listing.get()
# }
process = CrawlerProcess(
settings = {
'FEED_URI': 'rightmove.jl',
'FEED_FORMAT': 'jsonlines'
}
)
process.crawl(listingsSpider)
process.start()

Web Scraping all Urls from a website with Scrapy and Python

I am writing a web scraper to fetch a group of links
(located at tree.xpath('//div[#class="work_area_content"]/a/#href')
from a website and return the Title and Url of all the leafs sectioned by the leafs parent. I have two scrapers: one in python and one in Scrapy for Python. What is the purpose of callbacks in the Scrapy Request method? Should the information be in a multidimensional or single dimension list ( I believe multi-dimensional but it enhances complication)? Which of the below code is better? If the scraper code is better, how do I migrate the python code to the Scrapy code?
From what I understand from callbacks is that it passes a function's arguments to another function; however, if the callback refers to itself, the data gets overwritten and therefore lost, and you're unable to go back to the root data. Is this correct?
python:
url_storage = [ [ [ [] ] ] ]
page = requests.get('http://1.1.1.1:1234/TestSuites')
tree = html.fromstring(page.content)
urls = tree.xpath('//div[#class="work_area_content"]/a/#href').extract()
i = 0
j = 0
k = 0
for i, url in enumerate(urls):
absolute_url = "".join(['http://1.1.1.1:1234/', url])
url_storage[i][j][k].append(absolute_url)
print(url_storage)
#url_storage.insert(i, absolute_url)
page = requests.get(url_storage[i][j][k])
tree2 = html.fromstring(page.content)
urls2 = tree2.xpath('//div[#class="work_area_content"]/a/#href').extract()
for j, url2 in enumerate(urls2):
absolute_url = "".join(['http://1.1.1.1:1234/', url2])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree3 = html.fromstring(page.content)
urls3 = tree3.xpath('//div[#class="work_area_content"]/a/#href').extract()
for k, url3 in enumerate(urls3):
absolute_url = "".join(['http://1.1.1.1:1234/', url3])
url_storage[i][j][k].append(absolute_url)
page = requests.get(url_storage[i][j][k])
tree4 = html.fromstring(page.content)
urls3 = tree4.xpath('//div[#class="work_area_content"]/a/#href').extract()
title = tree4.xpath('//span[#class="page_title"]/text()').extract()
yield Request(url_storage[i][j][k], callback=self.end_page_parse_TS, meta={"Title": title, "URL": urls3 })
#yield Request(absolute_url, callback=self.end_page_parse_TC, meta={"Title": title, "URL": urls3 })
def end_page_parse_TS(self, response):
print(response.body)
url = response.meta.get('URL')
title = response.meta.get('Title')
yield{'URL': url, 'Title': title}
def end_page_parse_TC(self, response):
url = response.meta.get('URL')
title = response.meta.get('Title')
description = response.meta.get('Description')
description = response.xpath('//table[#class="wiki_table]/tbody[contains(/td/text(), "description")/parent').extract()
yield{'URL': url, 'Title': title, 'Description':description}
Scrapy:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractor import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from datablogger_scraper.items import DatabloggerScraperItem
class DatabloggerSpider(CrawlSpider):
# The name of the spider
name = "datablogger"
# The domains that are allowed (links to other domains are skipped)
allowed_domains = ['http://1.1.1.1:1234/']
# The URLs to start with
start_urls = ['http://1.1.1.1:1234/TestSuites']
# This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
rules = [
Rule(
LinkExtractor(
canonicalize=True,
unique=True
),
follow=True,
callback="parse_items"
)
]
# Method which starts the requests by visiting all URLs specified in start_urls
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(url, callback=self.parse, dont_filter=True)
# Method for parsing items
def parse_items(self, response):
# The list of items that are found on the particular page
items = []
# Only extract canonicalized and unique links (with respect to the current page)
links = LinkExtractor(canonicalize=True, unique=True).extract_links(response)
# Now go through all the found links
item = DatabloggerScraperItem()
item['url_from'] = response.url
for link in links:
item['url_to'] = link.url
items.append(item)
# Return all the found items
return items

scrapinghub/splash loses some data on rendering

I'm trying to create web-scraper for dynamic website. For this purpose I'm using Scrapy 1.2.1 and scrapy-splash 0.7 library.
The problem appears when using splash server, most of the times, it returns different data to scrapy. From log I can see that all pages are crawled. If I use scrapy.Request instead of SplashRequest, everything is OK (I get the same data each time).
My code:
import scrapy
import re
from scrapy_splash import SplashRequest
from scraper.items import ScraperRozetka
class RozetkaSpider(scrapy.Spider):
name = "rozetka_laptops"
start_urls = [
'http://rozetka.com.ua/notebooks/c80004/filter/producer=dell;page=1/',
]
def parse(self, response):
last_page = response.xpath('//ul[#name="paginator"]//li[last()]//#id').extract_first()
last_page_num = int(last_page[-1])
i = 1
while i <= last_page_num:
url = re.sub(r'page=\d+', r'page={}'.format(i), response.url)
i += 1
yield SplashRequest(url, self.parse_results, endpoint='render.html', args={'wait': 0.5, 'timeout': 60})
def parse_results(self, response):
items = []
records = response.css('div.g-i-tile-catalog')
for record in records:
item = ScraperRozetka()
item['title'] = record.css('img::attr(title)').extract_first()
item['price'] = record.css('div.g-price-uah::text').extract_first()
item['link'] = record.css('div.g-i-tile-i-title a::attr(href)').extract_first()
items.append(item)
return items
Would be grateful if someone helped me.
Thanks.

Categories

Resources