I'm building a Scrapy crawler/spider for a determined website where I will send scrapy a starting url (let's call it start_urls) and it will get the response of all the urls (based on determined parameters) that are contained on it
Summarizing: It should enter into start_url and then search for the company_urlsĀ given an allow parameter in the rule and return the response of each company url (everything done with headers). My code is only parsing the start_urls, not considering the urls I want to extract for the parser. What could be wrong?
LinkExtractor
link_extractor = LinkExtractor(
allow=['/organization/'],
allow_domains=['www.scrapsite.com'],
deny_extensions=IGNORED_EXTENSIONS, # Filter *.zip, *.csv, etc (add other extensions as required)
process_value=lambda url: process_url_value(url, NAME, cleaning_regex=[company_regex]),
)
ScrapySpider
class scrapsiteSpider(CrawlSpider):
name = NAME
download_delay = 5.0
main_page = MAIN_PAGE
HEADERS = {
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
}
start_urls = [
f'https://www.scrapsite.com/search/companies/field/companies/company_page/{rank}'
for rank in range(75, 132, 10)
rules = [Rule(link_extractor, callback='parse', follow=True)]
headers = HEADERS
#classmethod
def start_requests(cls):
logger.info('Starting scrapsite scraping')
for url in cls.start_urls:
cls.log_counter += 1
if cls.log_counter % cls.log_divider == 0:
logger.info(f'Start request: {url}')
yield Request(url, dont_filter=True, headers=HEADERS)
#classmethod
def parse(cls, response: Response):
# CAPTURE COMPANIES
logger.info(f"#### parse PREPROCESSING company {response.url}")
logger.info(f"{response.meta}")
if company_regex.search(response.url):
logger.info(f"Company Detected: {response.url.split('/')[-1]}")
return cls.parse_item(response, AddedItem())
#classmethod
def parse_item(cls, response: Response, item: Item) -> Item:
logger.info(f"#### parse_item PREPROCESSING company {response.url}")
item.set_url(value=response.url)
item.set_source(value=cls.name)
item.set_response_data(value=response.text)
item.set_uuid(value=make_id_from_url(url=response.url))
yield item
Related
I am attempting to extract info about articles from this site. I am a Scrapy newbie, and bit stuck as to why I don't getting any output, although I I am able to get all the correct URL outputted. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
name = 'articles'
allowed_domains = ['artofmanliness.com']
max_pages = 200
def start_requests(self):
for i in range(self.max_pages):
yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)
def parse(self, response):
# AOM has a list of all articles in pages of about 189
for article in response.xpath('//article[contains(#class, "aom-article-simple")]'):
url = article.xpath('.//a/#href').extract()
print(url)
if url:
yield Request(url=url[0], callback=self.parse_article)
def parse_article(self, response):
title = response.xpath('//*[#id="post-title entry-title"]/header/h1//text()').extract()
category = response.xpath('//*[#id="in-category"]/header/p[1]//text()').extract()
date = response.xpath('//*[#id="single-date"]/header/p[2]/span[2]//text()').extract()
yield {
'Title': title,
'Category': category,
'Date': date,
'URL': response.url
}
Here is settings.py:
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOW_ALL = True
I checked HTML and there is no title
'//*[#id="post-title entry-title"]/header/h1//text()'
but
'//h1[#class="post-title entry-title"]/text()'
or even simpler
'//h1[#itemprop="headline"]/text()
And probably you have the same problem with other elements
EDIT:
There is no category
'//*[#id="in-category"]/header/p[1]//text()'
but
'//p[#class="in-category"]//a/text()'
There is no date
'//*[#id="single-date"]/header/p[2]/span[2]//text()'
but
'//p[#class="single-date"]//span[2]/text()'
or even simpler
'//span[#itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess().
Everyone can paste all code in one file script.py and run it as python script.py without creating project.
I use max_pages = 2 to test only few articles.
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
name = 'articles'
allowed_domains = ['artofmanliness.com']
max_pages = 2 # 200
def start_requests(self):
for i in range(self.max_pages):
yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)
def parse(self, response):
# AOM has a list of all articles in pages of about 189
for article in response.xpath('//article[contains(#class, "aom-article-simple")]'):
url = article.xpath('.//a/#href').extract()
print('article url:', url)
if url:
yield Request(url=url[0], callback=self.parse_article)
def parse_article(self, response):
#title = response.xpath('//h1[#class="post-title entry-title"]/text()').extract()
title = response.xpath('//h1[#itemprop="headline"]/text()').extract()
category = response.xpath('//p[#class="in-category"]//a/text()').extract()
#date = response.xpath('//p[#class="single-date"]//span[2]/text()').extract()
date = response.xpath('//span[#itemprop="datePublished"]/text()').extract()
yield {
'Title': title,
'Category': category,
'Date': date,
'URL': response.url
}
from scrapy.crawler import CrawlerProcess
c = CrawlerProcess({
'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
# save in file CSV, JSON or XML
'FEEDS': {'output.csv': {'format': 'csv'}}, # new in 2.1
})
c.crawl(ArticlesSpider)
c.start()
Ok, let's say I have a website that has listed job offers and there are multiple pages (dynamic, that why I'm, using selenium). What I want to do:
Scrape every job post URL on multiple pages,
Scrape from every URL few items (title, localization, etc.)
class JobScraper(scrapy.Spider):
name = "jobscraper"
allowed_domains = ['pracuj.pl']
total = 10
start_urls = [
'https://www.pracuj.pl/praca/it%20-%20rozw%c3%b3j%20oprogramowania;cc,5016/%c5%82%c3%b3dzkie;r,5?rd=10&pn={}'.format(i)
for i in range(1, total)
]
custom_settings = {
'LOG_LEVEL': 'INFO',
}
def __init__(self):
self.options = webdriver.ChromeOptions()
self.options.headless = True
self.driver = webdriver.Chrome(r'C:\Users\kacpe\OneDrive\Pulpit\Python\Projekty\chromedriver.exe',
options=self.options)
def parse(self, response):
self.driver.get(response.url)
res = response.replace(body=self.driver.page_source)
offers = res.xpath('//li[contains(#class, "results__list-container")]')
for offer in offers:
link = offer.xpath('.//a[#class="offer-details__title-link"]/#href').extract()
yield Request(link, callback=self.parse_page)
def parse_page(self, response):
title = response.xpath('//h1[#data-scroll-id="job-title"]/text()').extract()
yield {
'job_title': title
}
And it doesn't work, an error that occurred:
TypeError: Request url must be str or unicode, got list
You call extract in this line:
link = offer.xpath('.//a[#class="offer-details__title-link"]/#href').extract()
Extract returns a list of elements, which is why you get the error when you try to pass link to Request.
Depending what you wanted to do, you could do for link in links and Request each result, or get a specific xpath by using find_elements_by_xpath.
You don't need selenium to scrape the required content. Turns out that the items you wish to grab from that site are within some script tag. Once you scoop out that portion using regex and process it using json library, you should access them very easily. The following is how I meant:
import json
import scrapy
class JobScraper(scrapy.Spider):
name = "jobscraper"
total = 10
start_urls = [
'https://www.pracuj.pl/praca/it%20-%20rozw%c3%b3j%20oprogramowania;cc,5016/%c5%82%c3%b3dzkie;r,5?rd=10&pn={}'.format(i)
for i in range(1, total)
]
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'
}
def start_requests(self):
for start_url in self.start_urls:
yield scrapy.Request(start_url,callback=self.parse,headers=self.headers)
def parse(self, response):
items = response.css("script:contains('window.__INITIAL_STATE__')::text").re_first(r"window\.__INITIAL_STATE__ =(.*);")
for item in json.loads(items)['offers']:
yield {
"title":item['jobTitle'],
"employer":item['employer'],
"country":item['countryName'],
"details_page":item['companyProfileUrl']
}
I've built a simple scrapy spider running on scrapinghub:
class ExtractionSpider(scrapy.Spider):
name = "extraction"
allowed_domains = ['domain']
start_urls = ['http://somedomainstart']
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
def parse(self, response):
urls = response.css('a.offer-details__title-link::attr(href)').extract()
print(urls)
for url in urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
print(multiple_locs_urls)
for url in multiple_locs_urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield SplashRequest(url=next_page_url, callback=self.parse)
def parse_details(self, response):
yield {
'title': response.css('#jobTitle').extract_first(),
'content': response.css('#description').extract_first(),
'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
'address': response.css('span[itemprop="address"]').extract_first()
}
The problem I am facing is that the multiple_locs_url response.css returns an empty array despite me seeing it in the markup on the browser side.
I checked with scrapy shell and scrapy shell does not see the markup. I guess this is due to the markup being rendered through javascript when the page is loaded.
I added splash but that does not seem to apply to response. How would I make scrapy wait with the query until the page is loaded?
See source code for the page: view-source:pracuj.pl/praca/polska;ct,1 .
There is no element with class "offer-regions__label" in html code.
This code will always return an empty list:
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)')
But as explained here https://stackoverflow.com/a/17697329/9913319:
Many times when crawling we run into problems where content that is
rendered on the page is generated with Javascript and therefore scrapy
is unable to crawl for it.
In this case you can use Selenium.
I changed your code and checked it and it works:
class ExtractionSpider(scrapy.Spider):
name = "extraction"
allowed_domains = ['domain']
start_urls = ['http://somedomainstart']
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
def __init__( self, **kwargs ):
super().__init__( **kwargs )
profile = webdriver.FirefoxProfile( "pathToFirefoxProfile" )
firefox_binary = "pathToFirefoxBinary" # Must be the developer edition!!!
# self.driver = webdriver.Firefox()
self.driver = webdriver.Firefox( profile, firefox_binary = firefox_binary )
def parse(self, response):
self.driver.get( response.url )
elements = self.driver.find_elements_by_css_selector( "a.offer-details__title-link" )
self.driver.get( response.url )
for element in elements:
print( "****" )
print( str( element.get_attribute( "href" ) ) )
print( str( element.text ) )
# your old code below
urls = response.css('a.offer-details__title-link::attr(href)').extract()
print(urls)
for url in urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
multiple_locs_urls = response.css('a.offer-regions__label::attr(href)').extract()
print(multiple_locs_urls)
for url in multiple_locs_urls:
url = response.urljoin(url)
yield SplashRequest(url=url, callback=self.parse_details)
next_page_url = response.css('li.pagination_element--next > a.pagination_trigger::attr(href)').extract_first()
if next_page_url:
next_page_url = response.urljoin(next_page_url)
yield SplashRequest(url=next_page_url, callback=self.parse)
def parse_details(self, response):
yield {
'title': response.css('#jobTitle').extract_first(),
'content': response.css('#description').extract_first(),
'datePosted': response.css('span[itemprop="datePosted"]').extract_first(),
'address': response.css('span[itemprop="address"]').extract_first()
}
I am trying to scrape twitter.
please enter search.twitter.com and put Comorbidity in search form.
I can get first page correctly, I can see when scrolling down for more tweets, the next page can be gotten from min_position param.
But when send request with next page, I can't get correct content.
Here is my some code.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
def start_requests(self):
yield Request(url=self.start_urls[0], callback=self.parse_search_page)
def parse_search_page(self, response):
keyword = 'Comorbidity'
search_url = self.search_url.format(keyword=keyword)
yield Request(url=search_url, callback=self.parse_twitter_page, headers=self.headers)
def parse_twitter_page(self, response):
next_page = None
if self.current_page == 0:
posts = response.xpath('//li[#data-item-type="tweet"]').extract()
min_position = re.search('data-min-position="(.*?)"', response.body)
if min_position:
min_position = min_position.group(1)
next_page = self.next_page_url.format(position=min_position.replace('cm+', 'cm%2B').replace('==', '%3D%3D'))
self.current_page = 1
else:
json_data = json.loads(response.body)
min_position = json_data.get('min_position')
if next_page:
yield scrapy.http.Request(
url=self.next_page_url,
callback=self.parse_twitter_page,
headers=self.headers,
)
How can I get correct min_position?
I think you had a wrong in parse_twitter_page method.
if next_page:
yield scrapy.http.Request(
url=next_page,
callback=self.parse_twitter_page,
headers=self.headers,
)
It should not be self.next_page_url.
I changed from self.next_page_url to next_page
I hope this will works.
I am using Python 3.5.2 and Scrapy 1.1.
There is a nested request in the demo below, specifically, in the page of article content, there is an ajax request which gets the author when he logs in.
I wrote it as follow, I can't get the author and I have a hard time understanding where the problem is.
demo:
# -*- coding: utf-8 -*-
import scrapy
from demo.items import ExampleItem
from scrapy.spiders import CrawlSpider
import re
class ExampleSpider(CrawlSpider):
name = "example"
allowed_domains = ["example.com"]
start_urls = [
"http://www.example.com/articles-list.php?page=1",
"http://www.example.com/articles-list.php?page=2",
"http://www.example.com/articles-list.php?page=3",
"http://www.example.com/articles-list.php?page=4",
"http://www.example.com/articles-list.php?page=5",
"http://www.example.com/articles-list.php?page=6",
]
headers = {
'Accept':'*/*',
'Accept-Encoding':'gzip, deflate, sdch',
'Accept-Language':'zh-CN,zh;q=0.8',
'Connection':'keep-alive',
'Cookie':'PHPSESSID=12345370000029b72333333dc999999; QS[uid]=100; QS[username]=example; QS[password]=example.com; QS[pmscount]=1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2774.3 Safari/537.36',
'X-Requested-With':'XMLHttpRequest'
}
def parse(self, response):
hrefs = response.xpath('a/#href')
for href in hrefs:
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_article_contents)
def parse_article_contents(self, response):
for sel in response.xpath('/html/body'):
item = ExampleItem()
item['articleUrl'] = response.url
item['title'] = sel.xpath('div[3]/a[2]/#href')[0].extract()
item['content'] = sel.xpath('div[2]/div[2]/div[1]/text()')[0].extract()
#In the page of artile content,there is an ajax request,which get the auther when login.
articleId = re.search(u'id=(\d{1,4})&', item['articleUrl']).group(1)
articleAuthorUrl = 'http://www.example.com/plus/ajax_author.php?id=' + articleId
#Crawling auther below.Is it correct?
def request_article_author(self):
return scrapy.Request(url=articleAuthorUrl,headers=headers,callback=self.parse_article_author)
def parse_article_author(self, response):
item['author'] = response.xpath('/html/body/div/div[1]/div[2]/text()').extract()
# item['author'] can be yielded var "yield item" below?
yield item
Any ideas?