How to scrape elements from a scraped URL? Scrapy - Python

OK, let's say I have a website that lists job offers across multiple pages (the pages are dynamic, which is why I'm using Selenium). What I want to do:
Scrape every job post URL on the multiple pages,
Scrape a few items (title, location, etc.) from every URL.
import scrapy
from scrapy import Request
from selenium import webdriver

class JobScraper(scrapy.Spider):
    name = "jobscraper"
    allowed_domains = ['pracuj.pl']
    total = 10
    start_urls = [
        'https://www.pracuj.pl/praca/it%20-%20rozw%c3%b3j%20oprogramowania;cc,5016/%c5%82%c3%b3dzkie;r,5?rd=10&pn={}'.format(i)
        for i in range(1, total)
    ]
    custom_settings = {
        'LOG_LEVEL': 'INFO',
    }

    def __init__(self):
        self.options = webdriver.ChromeOptions()
        self.options.headless = True
        self.driver = webdriver.Chrome(r'C:\Users\kacpe\OneDrive\Pulpit\Python\Projekty\chromedriver.exe',
                                       options=self.options)

    def parse(self, response):
        self.driver.get(response.url)
        res = response.replace(body=self.driver.page_source)
        offers = res.xpath('//li[contains(@class, "results__list-container")]')
        for offer in offers:
            link = offer.xpath('.//a[@class="offer-details__title-link"]/@href').extract()
            yield Request(link, callback=self.parse_page)

    def parse_page(self, response):
        title = response.xpath('//h1[@data-scroll-id="job-title"]/text()').extract()
        yield {
            'job_title': title
        }
And it doesn't work; this is the error that occurred:
TypeError: Request url must be str or unicode, got list

You call extract() in this line:
link = offer.xpath('.//a[@class="offer-details__title-link"]/@href').extract()
extract() returns a list of elements, which is why you get the error when you pass link to Request.
Depending on what you want to do, you could loop over the results and yield a Request for each one, or take a single match (for example with extract_first() instead of extract()).
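A minimal sketch of the loop-based fix, reusing the selectors from the question (response.urljoin is added here in case the extracted hrefs are relative):

    def parse(self, response):
        self.driver.get(response.url)
        res = response.replace(body=self.driver.page_source)
        for offer in res.xpath('//li[contains(@class, "results__list-container")]'):
            # extract() returns a list, so iterate it and yield one Request per URL
            for link in offer.xpath('.//a[@class="offer-details__title-link"]/@href').extract():
                yield Request(response.urljoin(link), callback=self.parse_page)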

You don't need Selenium to scrape the required content. It turns out the items you want to grab from that site are inside a script tag. Once you pull that portion out with a regex and parse it with the json library, you can access them very easily. Here is what I mean:
import json
import scrapy

class JobScraper(scrapy.Spider):
    name = "jobscraper"
    total = 10
    start_urls = [
        'https://www.pracuj.pl/praca/it%20-%20rozw%c3%b3j%20oprogramowania;cc,5016/%c5%82%c3%b3dzkie;r,5?rd=10&pn={}'.format(i)
        for i in range(1, total)
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
    }

    def start_requests(self):
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        items = response.css("script:contains('window.__INITIAL_STATE__')::text").re_first(r"window\.__INITIAL_STATE__ =(.*);")
        for item in json.loads(items)['offers']:
            yield {
                "title": item['jobTitle'],
                "employer": item['employer'],
                "country": item['countryName'],
                "details_page": item['companyProfileUrl']
            }
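In isolation, that extract-and-parse pattern looks like this (toy HTML and field name made up purely for illustration):

import json
from scrapy import Selector

# Toy page carrying its state in a script tag, mimicking window.__INITIAL_STATE__
html = '<script>window.__INITIAL_STATE__ ={"offers": [{"jobTitle": "Python Dev"}]};</script>'

raw = Selector(text=html).css("script::text").re_first(r"window\.__INITIAL_STATE__ =(.*);")
state = json.loads(raw)
print(state['offers'][0]['jobTitle'])  # -> Python Dev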

Related

Scrapy (with headers) is not extracting or parsing the item_urls

I'm building a Scrapy crawler/spider for a particular website. I give Scrapy a starting URL (let's call it start_urls) and it should get the response of all the URLs (matching certain parameters) contained on it.
Summarizing: it should enter start_url, find the company_urls allowed by the rule's allow parameter, and return the response of each company URL (everything done with headers). My code is only parsing the start_urls and ignoring the URLs I want to pass to the parser. What could be wrong?
LinkExtractor
link_extractor = LinkExtractor(
    allow=['/organization/'],
    allow_domains=['www.scrapsite.com'],
    deny_extensions=IGNORED_EXTENSIONS,  # Filter *.zip, *.csv, etc. (add other extensions as required)
    process_value=lambda url: process_url_value(url, NAME, cleaning_regex=[company_regex]),
)
ScrapySpider
class scrapsiteSpider(CrawlSpider):
    name = NAME
    download_delay = 5.0
    main_page = MAIN_PAGE
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    start_urls = [
        f'https://www.scrapsite.com/search/companies/field/companies/company_page/{rank}'
        for rank in range(75, 132, 10)
    ]
    rules = [Rule(link_extractor, callback='parse', follow=True)]
    headers = HEADERS

    @classmethod
    def start_requests(cls):
        logger.info('Starting scrapsite scraping')
        for url in cls.start_urls:
            cls.log_counter += 1
            if cls.log_counter % cls.log_divider == 0:
                logger.info(f'Start request: {url}')
            yield Request(url, dont_filter=True, headers=HEADERS)

    @classmethod
    def parse(cls, response: Response):
        # CAPTURE COMPANIES
        logger.info(f"#### parse PREPROCESSING company {response.url}")
        logger.info(f"{response.meta}")
        if company_regex.search(response.url):
            logger.info(f"Company Detected: {response.url.split('/')[-1]}")
            return cls.parse_item(response, AddedItem())

    @classmethod
    def parse_item(cls, response: Response, item: Item) -> Item:
        logger.info(f"#### parse_item PREPROCESSING company {response.url}")
        item.set_url(value=response.url)
        item.set_source(value=cls.name)
        item.set_response_data(value=response.text)
        item.set_uuid(value=make_id_from_url(url=response.url))
        yield item
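For reference, the Scrapy documentation warns against using parse as the callback of a CrawlSpider Rule, because CrawlSpider uses parse internally to apply the rules. A minimal sketch of the conventional layout (the callback name parse_company and the yielded field are assumptions for illustration):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class CompanySpider(CrawlSpider):
    name = 'companies'
    start_urls = ['https://www.scrapsite.com/search/companies/field/companies/company_page/75']
    rules = [
        # use a callback other than 'parse' so CrawlSpider's own parse keeps following links
        Rule(LinkExtractor(allow=['/organization/']), callback='parse_company', follow=True),
    ]

    def parse_company(self, response):
        yield {'url': response.url}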

Scrapy together with Selenium for the data of dynamic pages

I have a problem with going to the next page: the spider goes to the next page but then returns to the first page again and only gives the data of page 1. I have tried different approaches but have not been able to solve this problem. If there is any solution, please provide it. This is the page link: https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx
import scrapy
from scrapy import Request
from selenium import webdriver

class TestSpider(scrapy.Spider):
    name = 'test'
    start_urls = ['https://www.ifep.ro/justice/lawyers/lawyerspanel.aspx']
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1,
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    def parse(self, response):
        books = response.xpath("//div[@class='list-group']//@href").extract()
        for book in books:
            url = response.urljoin(book)
            if url.endswith('.ro') or url.endswith('.ro/'):
                continue
            yield Request(url, callback=self.parse_book)

    def __init__(self):
        self.driver = webdriver.Chrome('C:\Program Files (x86)\chromedriver.exe')

    def parse_book(self, response):
        title = response.xpath("//span[@id='HeadingContent_lblTitle']//text()").get()
        d1 = response.xpath("//div[@class='col-md-10']//p[1]//text()").get()
        d1 = d1.strip()
        d2 = response.xpath("//div[@class='col-md-10']//p[2]//text()").get()
        d2 = d2.strip()
        d3 = response.xpath("//div[@class='col-md-10']//p[3]//span//text()").get()
        d3 = d3.strip()
        d4 = response.xpath("//div[@class='col-md-10']//p[4]//text()").get()
        d4 = d4.strip()
        yield {
            "title1": title,
            "title2": d1,
            "title3": d2,
            "title4": d3,
            "title5": d4,
        }
        self.driver.get(response.url)
        while True:
            next = self.driver.find_element_by_xpath("//a[@id='MainContent_PagerTop_NavNext']")
            try:
                next.click()
                # get the data and write it to scrapy items
            except:
                break
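One common way to handle this kind of ASP.NET postback pagination is to drive the whole flow through Selenium and rebuild a Scrapy Selector from driver.page_source after every click. A rough sketch under that assumption, reusing the selectors from the question:

from scrapy import Selector
from selenium.webdriver.common.by import By

def scrape_all_pages(driver, start_url):
    driver.get(start_url)
    while True:
        # parse the HTML of the page currently loaded in the browser
        sel = Selector(text=driver.page_source)
        for href in sel.xpath("//div[@class='list-group']//@href").extract():
            yield href
        try:
            driver.find_element(By.XPATH, "//a[@id='MainContent_PagerTop_NavNext']").click()
        except Exception:
            break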

Trouble outputting data with Scrapy

I am attempting to extract info about articles from this site. I am a Scrapy newbie, and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs outputted. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked the HTML and there is no title at
'//*[@id="post-title entry-title"]/header/h1//text()'
but at
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
And probably you have the same problem with the other elements.
EDIT:
There is no category at
'//*[@id="in-category"]/header/p[1]//text()'
but at
'//p[@class="in-category"]//a/text()'
There is no date at
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but at
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess().
You can paste all the code into one file script.py and run it as python script.py without creating a project.
I use max_pages = 2 to test only a few articles.
import scrapy
from scrapy.http import Request

class ArticlesSpider(scrapy.Spider):
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200

    def start_requests(self):
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()

How to send objects from one rule to another with Scrapy

I'm trying to scrape Glassdoor company ratings, and at some point I need to send some objects from one rule to another.
This is the main link for the search: https://www.glassdoor.com/Reviews/lisbon-reviews-SRCH_IL.0,6_IM1121.htm
I access this page in the first Rule and get some information; then I need to go to another link from this page, to enter the reviews page, following the XPath expression //a[@class='eiCell cell reviews '].
Here is the problem: how can I follow this link given by the XPath expression, inside parse_item, without losing the information that I already got?
class GetComentsSpider(CrawlSpider):
    name = 'get_coments'
    allowed_domains = ['www.glassdoor.com']
    start_urls = ['http://https://www.glassdoor.com/Reviews/portugal-reviews-SRCH_IL.0,8_IN195.htm/']
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    download_delay = 0.1

    rules = (
        # Access the page, get the link for each company and move to parse_item
        Rule(LinkExtractor(restrict_xpaths="//div[@class=' margBotXs']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='eiCell cell reviews ']"), callback='parse_item', follow=True),
        # Pagination
        Rule(LinkExtractor(restrict_xpaths="//li[@class='next']/a"), follow=True),
    )

    def parse_item(self, response):
        # get company name and rating
        name = response.xpath("(//span[@class='updateBy'])[1]").get()
        rating = response.xpath("//span[@class='bigRating strong margRtSm h1']/text()").get()
        # Here I need to go to the link of //a[@class='eiCell cell reviews '] to get more data
        # without losing the name and rating
        yield {
            "Name": name,
            "Rating": rating
        }
You can send data to the other parser using Request(..., meta=...)
(and you don't need a Rule to get the URL for this request):

    def parse_item(self, response):
        name = response.xpath("(//span[@class='updateBy'])[1]").get()
        rating = response.xpath("//span[@class='bigRating strong margRtSm h1']/text()").get()
        item = {
            "Name": name,
            "Rating": rating
        }
        url = ...  # Here I need to go to the link of //a[@class='eiCell cell reviews '] to get more data
        yield Request(url, callback=self.other_parser, meta={"item": item})

    def other_parser(self, response):
        item = response.meta['item']
        item['other'] = ...  # add values to item
        yield item
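For what it's worth, Scrapy 1.7+ also supports passing data with cb_kwargs instead of meta, so the item arrives as a keyword argument in the callback; a small sketch of the same idea (the reviews-link XPath is taken from the question, the rest is placeholder):

    def parse_item(self, response):
        item = {"Name": "...", "Rating": "..."}
        url = response.xpath("//a[@class='eiCell cell reviews ']/@href").get()
        if url:
            yield Request(response.urljoin(url), callback=self.other_parser, cb_kwargs={"item": item})

    def other_parser(self, response, item):
        item["other"] = ...  # add values from the reviews page here
        yield item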

How to force Scrapy to show all items instead of just the last one?

Having the following spider:
import scrapy

class ScrapeNames(scrapy.Spider):
    name = 'final2'
    start_urls = [
        'https://www.trekearth.com/members/'
    ]

    def parse(self, response):
        for entry in response.xpath('//table[@class="member-table"]'):
            for name in entry.xpath('.//tr[@class="row"]/td/p/a/text()|.//tr/td/p/a/text()').extract():
                item['name'] = name
            for photo in entry.xpath('.//tr[@class="row"]/td[6]/a/text()|.//tr[@class="row"]/td[6]/text()|.//tr/td[6]/text()|.//tr/td[6]/a/text()').extract():
                item['photo'] = photo
            yield item
I want to extract the number of photos taken by each user and then export it to CSV. However, in my .csv I only have the last item of the table on this page (see screenshot below).
What I obviously want is the member name and number of photos taken for all of the users on the page. What am I doing wrong? How can I fix this?
EDIT:
Possibly this is relevant as well, but my items.py file looks like this:
import scrapy

class FinalItem(scrapy.Item):
    name = scrapy.Field()
    photo = scrapy.Field()
FOLLOW UP QUESTION:
I have introduced some improvements into my code which is now:
class ScrapeMovies(scrapy.Spider):
    name = 'final2'
    start_urls = [
        'https://www.trekearth.com/members/'
    ]

    def parse(self, response):
        item = FinalItem()
        for entry in response.xpath('//table[@class="member-table"]'):
            for name in entry.xpath('.//tr[@class="row"]/td/p/a/text()|.//tr/td/p/a/text()').extract():
                names = entry.xpath('.//tr[@class="row"]/td/p/a/text()|.//tr/td/p/a/text()').extract()
                item['name'] = ";".join(names)
            for photos in entry.xpath('.//tr[@class="row"]/td[6]/a/text()|.//tr[@class="row"]/td[6]/text()|.//tr/td[6]/text()|.//tr/td[6]/a/text()').extract():
                photos = entry.xpath('.//tr[@class="row"]/td[6]/a/text()|.//tr[@class="row"]/td[6]/text()|.//tr/td[6]/text()|.//tr/td[6]/a/text()').extract()
                item['photo'] = ";".join(photos)
            yield item
However, this created a mess in the final .csv, which now looks like this:
Is there a simple way to fix this?
Sample desired output in .csv below:
EDIT2:
My spider now:
import scrapy
from final.items import FinalItem

class ScrapeMovies(scrapy.Spider):
    name = 'final2'
    start_urls = [
        'https://www.trekearth.com/members/'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            item['photos'] = row.xpath('string(./td[6])').extract_first()
            yield item
This still does not yield the proper result; I get an empty .csv. I have updated settings.py.
UPDATE
You need to have this line in your settings.py (the site blocks the default Scrapy user-agent):
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'
and then this will work:
def parse(self, response):
    for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
        item = FinalItem()
        item['name'] = row.xpath('./td[2]//a/text()').extract_first()
        item['photos'] = row.xpath('string(./td[6])').extract_first()
        yield item
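If it helps, the CSV export can also be configured on the spider itself through the FEEDS setting (feed exports, Scrapy 2.1+), so the output file does not depend on command-line flags; a minimal sketch reusing the XPaths from the update (the class name and output file name are just examples):

import scrapy

class MembersSpider(scrapy.Spider):
    name = 'final2'
    start_urls = ['https://www.trekearth.com/members/']
    custom_settings = {
        # the site blocks the default Scrapy user-agent
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36',
        'FEEDS': {'members.csv': {'format': 'csv'}},  # feed exports, Scrapy 2.1+
    }

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            yield {
                'name': row.xpath('./td[2]//a/text()').extract_first(),
                'photos': row.xpath('string(./td[6])').extract_first(),
            }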

Categories

Resources