Scrapy Selenium: extracts first link then throws error - python

I'm working on a scraper that collects property information.
The original code works perfectly.
URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"
class huntsmanCSS(scrapy.Spider):
name = "huntsman"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('parcel_ids.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def parse(self, response):
yield {
'propId': response.css('#dnn_ctr388_View_tdPropertyID::text').extract_first(),
'address': response.css('#dnn_ctr388_View_tdPropertyAddress::text').extract_first(),
'owner': response.css('#dnn_ctr388_View_divOwnersLabel::text').extract_first(),
'propertyClass': response.css('#dnn_ctr388_View_tdGIPropertyClass::text').extract_first(),
'hood': response.css('#dnn_ctr388_View_tdGINeighborhood::text').extract_first(),
'buildType': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(3)::text').extract_first(),
'improveType': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(4)::text').extract_first(),
'yrBuilt': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(5)::text').extract_first(),
'saleDate': response.css('#dnn_ctr388_View_tblSalesHistoryData tr:nth-child(2) > td:nth-child(1)::text').extract_first(),
'TAV': response.css('#dnn_ctr388_View_tdPropertyValueHeader::text').extract_first(),
'price': response.css('#dnn_ctr388_View_tblSalesHistoryData > tr:nth-child(2) > td:nth-child(5)::text').extract_first(),
'sqFt': response.css('#resImprovementTable0 > tr:nth-child(2) > td:nth-child(6)::text').extract_first()
}
Using a list of all parcel IDs, it fills in the URL to go to the next property page.
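As a side note, the same URL building can be done lazily in start_requests(), which also closes the file handle; a minimal sketch (keeping the parse() callback above unchanged):

import scrapy

URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"

class huntsmanCSS(scrapy.Spider):
    name = "huntsman"
    allowed_domains = ["orion.lancaster.ne.gov"]

    def start_requests(self):
        # Read parcel IDs one per line and build a request for each,
        # closing the file when the generator is exhausted.
        with open('parcel_ids.txt') as f:
            for pid in f:
                pid = pid.strip()
                if pid:
                    yield scrapy.Request(URL.format(pid), callback=self.parse)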
Broken Code:
There is a link to a PDF that is embedded in a JavaScript button. The PDF contains more information that I want to scrape.
The spider retrieves the first link but then throws errors.
URL = "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/{}"
class resDatasheetLink(scrapy.Spider):
name = "resDatasheetLink"
allowed_domains = ["orion.lancaster.ne.gov"]
f = open('residential.txt')
start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
def __init__(self):
self.driver = webdriver.Chrome()
def parse(self, response):
self.driver.get(response.url)
while True:
try:
btn = WebDriverWait(self.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="btnDataSheet"]')))
btn.click()
except TimeoutException:
break
time.sleep(5)
link = self.driver.current_url
self.driver.close()
yield {
'datasheet': link
}
Error:
2021-12-30 10:40:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438> (referer: None)
2021-12-30 10:40:36 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://localhost:19113/session/5acb1d8f4ebdb13482ab40a67f846d1d/url {"url": "https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438"}
2021-12-30 10:40:36 [urllib3.connectionpool] DEBUG: http://localhost:19113 "POST /session/5acb1d8f4ebdb13482ab40a67f846d1d/url HTTP/1.1" 404 878
2021-12-30 10:40:36 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
2021-12-30 10:40:36 [scrapy.core.scraper] ERROR: Spider error processing <GET https://orion.lancaster.ne.gov/Property-Detail/PropertyQuickRefID/R402438> (referer: None)
Traceback (most recent call last):
selenium.common.exceptions.InvalidSessionIdException: Message: invalid session id

break takes you out of the while loop. You need to unindent the last few lines so they sit just below the try/except block, and invoke self.driver.close() (preferably self.driver.quit()) at the end of parse(), as follows:
def parse(self, response):
    self.driver.get(response.url)
    while True:
        try:
            btn = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="btnDataSheet"]')))
            btn.click()
        except TimeoutException:
            break
    time.sleep(5)
    link = self.driver.current_url
    yield {
        'datasheet': link
    }
    self.driver.close()

Based on the way the spider is configured, the loop was the issue. This is what worked:
class rDataLink(scrapy.Spider):
    name = "rDataLink"
    allowed_domains = ["orion.lancaster.ne.gov"]
    f = open('residential.txt')
    start_urls = [URL.format(pid.strip()) for pid in f.readlines()]

    def __init__(self):
        self.driver = webdriver.Chrome()

    def parse(self, response):
        self.driver.get(response.url)
        btn = WebDriverWait(self.driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="btnDataSheet"]')))
        btn.click()
        WebDriverWait(self.driver, 7).until(EC.url_changes(response.url))
        link = self.driver.current_url
        yield {
            'datasheet': link
        }
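One thing this version never does is shut Chrome down. A minimal sketch, assuming you want the single driver instance to live for the whole crawl, is to quit it in the spider's closed() hook:

    def closed(self, reason):
        # Called by Scrapy once the spider finishes; quit() ends the whole
        # WebDriver session (close() would only close the current window).
        self.driver.quit()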

Related

Scrapy: Debug Redirecting (301)

Before, I was getting the error "HTTP status code is not handled or not allowed"; I modified the USER_AGENT, which was at its default value, and now I am getting this error:
import scrapy

class OlxSpider(scrapy.Spider):
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        items = response.xpath(
            '//div[contains(@class,"section_OLXad-list")]//li[contains'
            '(@class,"item")]'
        )
        for item in items:
            url = item.xpath(
                ".//a[contains(@class,'OLXad-list-link')]/@href"
            ).extract_first()
            yield scrapy.Request(url=url, callback=self.parse_detail)
        next_page = response.xpath(
            '//li[contains(@class,"item next")]//a/@href'
        ).extract_first()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath(
            '//div[contains(@class,"photos")]//a/@href'
        ).extract()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath(
            'normalize-space(//h1[contains(@id,"ad_title")]//.)'
        ).extract_first()
        item['price'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-price")]'
            '//span[contains(@class,"actual-price")]//.)'
        ).extract_first()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
When trying to execute the .py file in the terminal, I get the following message:
2022-01-13 12:36:36 [scrapy.core.engine] INFO: Spider opened
2022-01-13 12:36:36 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-01-13 12:36:36 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/robots.txt> from <GET http://pe.olx.com.br/robots.txt>
2022-01-13 12:36:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://pe.olx.com.br/robots.txt> (referer: None)
2022-01-13 12:36:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://pe.olx.com.br/imoveis/aluguel> from <GET http://pe.olx.com.br/imoveis/aluguel>
Does anyone know what might be causing this problem?
P.S.: I have already tried the solutions from Python Scrapy 301 redirects.
It's just a redirect from http to https, so there's no problem there.
Your xpath selectors are completely wrong. I fixed them in parse, and I fixed 3 xpaths in parse_detail as an example, but you need to fix the rest of them.
import scrapy

class OlxSpider(scrapy.Spider):
    name = "olx"
    allowed_domains = ["pe.olx.com.br"]
    start_urls = (
        'http://pe.olx.com.br/imoveis/aluguel',
    )

    def parse(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        items = response.xpath('//ul[@id="ad-list"]/li')
        for item in items:
            url = item.xpath('.//a/@href').get()
            if url:
                yield scrapy.Request(url=url, callback=self.parse_detail)
        next_page = response.xpath('//a[@data-lurker-detail="next_page"]/@href').get()
        if next_page:
            self.log('Next Page: {0}'.format(next_page))
            yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_detail(self, response):
        self.log(u'Imóvel URL: {0}'.format(response.url))
        item = {}
        item['photos'] = response.xpath('//img[@class="image "]/@src').get()
        item['url'] = response.url
        item['address'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-location")]'
            '//.)'
        ).extract_first()
        item['title'] = response.xpath('//h1/text()').get()
        item['price'] = response.xpath('//h2/text()').get()
        item['details'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-description")]'
            '//.)'
        ).extract_first()
        item['source_id'] = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-id")]//strong//.)'
        ).extract_first()
        date = response.xpath(
            'normalize-space(//div[contains(@class,"OLXad-date")]//.)'
        ).re("Inserido em: (.*).")
        item['date'] = (date and date[0]) or ''
        yield item
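If you still want a non-default USER_AGENT while testing, a minimal sketch is to set it per spider through custom_settings (the UA string below is only an example):

import scrapy

class OlxSpider(scrapy.Spider):
    name = "olx"
    # Per-spider settings override the project-wide settings.py values.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0 Safari/537.36',
    }
    # start_urls, parse() and parse_detail() as in the answer above.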

When I run the command 'scrapy crawl Admission', the error that occurs is: Filtered offsite request to 'www.worldomets.info'

import scrapy

class AdmissionsSpider(scrapy.Spider):
    name = 'Admissions'
    allowed_domains = ["www.worldometers.info"]
    start_urls = ['https://www.worldometers.info/population/countries-in-asia-by-population/']

    def parse(self, response):
        countries = response.xpath("//td/a")
        for country in countries:
            name = country.xpath(".//text()").get()
            links = country.xpath(".//@href").get()
            absolute_url = f"https://www.worldomets.info{links}"
            yield scrapy.Request(url=absolute_url)
I am trying to print the country names, but it shows me the error Filtered offsite request to 'www.worldomets.info': <GET https://www.worldomets.info/world-population/china-population/>
You can try it like this:
import scrapy

class AdmissionsSpider(scrapy.Spider):
    name = 'Admissions'
    allowed_domains = ["worldometers.info"]
    start_urls = ['https://www.worldometers.info/population/countries-in-asia-by-population/']

    def parse(self, response):
        countries = response.xpath("//td/a")
        for country in countries:
            name = country.xpath(".//text()").get()
            link = country.xpath(".//@href").get()
            link = response.urljoin(link)
            # print(link)
            # absolute_url = f"https://www.worldomets.info{links}"
            yield scrapy.Request(url=link, callback=self.parse_absoluteurl)

    def parse_absoluteurl(self, response):
        print('\n', response.url, '\n')
For the details, see Following hyperlink and "Filtered offsite request".
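As a side note, if a request genuinely has to leave allowed_domains, you can mark that single request with dont_filter=True, which tells the offsite middleware (and the duplicate filter) to let it through; a minimal sketch inside parse():

    def parse(self, response):
        for country in response.xpath("//td/a"):
            link = response.urljoin(country.xpath(".//@href").get())
            # dont_filter=True lets this specific request bypass the offsite
            # and duplicate filters, so use it only for deliberate exceptions.
            yield scrapy.Request(url=link, callback=self.parse_absoluteurl, dont_filter=True)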

Scrapy-crawled-200 Referer-None

I'm trying to learn how to use Scrapy and Python, but I'm not an expert at all...
I have an empty file after crawling this page:
so.news.cn
and I don't understand why...
Here is my code:
import scrapy

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/']

    def parse(self, response):
        #titles = response.css('#newsCon > div.newsList > div.news > h2 > a::text').extract()
        #date = response.css('#newsCon > div.newsList > div.news> div > p.newstime > span::text').extract()
        titles = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/h2/a/text()").extract()
        date = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='resultList']/div[@class='newsListCnt secondlist']/div[@id='newsCon']/div[@class='newsList']/div[@class='news']/div[@class='easynews']/p[@class='newstime']/span/text()").extract()
        for item in zip(titles, date):
            scraped_info = {
                "title": item[0],
                "date": item[1],
            }
            yield scraped_info
        nextPg = response.xpath("/html/body/div[@id='search-result']/div[@class='resultCnt']/div[@id='pagination']/a[@class='next']/@href").extract()
        if nextPg is not None:
            print(nextPg)
This is the message in the console:
2020-05-11 00:09:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET http://so.news.cn/?keyWordAll=&keyWordOne=%E6%96%B0%E5%86%A0+%E8%82%BA%E7%82%8E+%E6%AD%A6%E6%B1%89+%E7%97%85%E6%AF%92&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn#search/0/%E6%96%B0%E5%86%A0/1/> (referer: None)
[]
You always need to check the page's source code (Ctrl+U) in your browser. The content you see in your browser may be loaded by an XHR JavaScript call. Here is code that works for me (I found the correct start URL using the Chrome Developer Console):
import scrapy
import json
import re

class XinhuaSpider(scrapy.Spider):
    name = 'xinhua'
    # allowed_domains = ['xinhuanet.com']
    start_urls = ['http://so.news.cn/getNews?keyWordAll=&keyWordOne=%25E6%2596%25B0%25E5%2586%25A0%2B%25E8%2582%25BA%25E7%2582%258E%2B%25E6%25AD%25A6%25E6%25B1%2589%2B%25E7%2597%2585%25E6%25AF%2592&keyWordIg=&searchFields=1&sortField=0&url=&senSearch=1&lang=cn&keyword=%E6%96%B0%E5%86%A0&curPage=1']

    def parse(self, response):
        data = json.loads(response.body)
        for item in data["content"]["results"]:
            scraped_info = {
                "title": item['title'],
                "date": item['pubtime'],
            }
            yield scraped_info
        current_page = data['content']['curPage']
        total_pages = data['content']['pageCount']
        if current_page < total_pages:
            next_page = re.sub(r'curPage=\d+', f"curPage={current_page + 1}", response.url)
            yield scrapy.Request(
                url=next_page,
                callback=self.parse,
            )
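To check the output quickly, a minimal sketch of running this spider from a plain Python script and exporting the items (assuming Scrapy 2.1+ for the FEEDS setting):

from scrapy.crawler import CrawlerProcess

# Assumes XinhuaSpider from above is defined in the same file or importable.
process = CrawlerProcess(settings={
    "FEEDS": {"results.json": {"format": "json"}},  # write scraped items to results.json
})
process.crawl(XinhuaSpider)
process.start()  # blocks until the crawl finishes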

Scrapy doesn't work for turning all the pages

I want to crawl the whole product category, but it seems that it works well up to some point and then it stops.
Here is my code:
import scrapy
from Demo.items import DemoItem

class ProductSpider(scrapy.Spider):
    name = 'black1'
    start_urls = ['https://octopart.com/search?category_ids=4215&start=0']

    def parse(self, response):
        items = DemoItem()
        for product in response.xpath("//div[@class='serp-card-header media']/div[@class='media-body']"):
            name = product.xpath(".//a/span[@class='part-card-manufacturer']/text()").extract()
            ver = product.xpath(".//a/span[@class='part-card-mpn']/text()").extract()
            items['product_name'] = ''.join(name).strip()
            items['product_code'] = ''.join(ver).strip()
            yield items
        next_page = response.xpath("//a[contains(text(), 'Next')]/@href").extract_first()
        print(next_page)
        if next_page is not None:
            print(next_page)
            next_page_link = response.urljoin(next_page)
            print(next_page_link)
            yield scrapy.Request(url=next_page_link, callback=self.parse)
And the outcome:
https://octopart.com/search?category_ids=4215&start=200
2019-03-06 13:51:46 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://octopart.com/search?category_ids=4215&start=200> (referer: https://octopart.com/search?category_ids=4215&start=190)
2019-03-06 13:51:46 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://octopart.com/search?category_ids=4215&start=200>: HTTP status code is not handled or not allowed
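The 403 on start=200 usually means the site has started refusing the crawler rather than a bug in the paging logic. A minimal first mitigation to try, assuming a browser-like User-Agent and a slower crawl are acceptable (it may still not be enough if the site actively blocks bots):

import scrapy

class ProductSpider(scrapy.Spider):
    name = 'black1'
    # Example values only; tune the delay for the target site.
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0 Safari/537.36',
        'DOWNLOAD_DELAY': 2,           # pause between requests
        'AUTOTHROTTLE_ENABLED': True,  # back off automatically under load
    }
    # start_urls and parse() as in the question above.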

Incomplete urls in scrapy

I am getting results such as:
[crawler] DEBUG: Crawled (200) <GET http://www.hormelfoods.com/About/Legal/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/link.aspx?_id=EFFFBF3348524C6ABCD1C2775E7FDA93&_z=z> (referer: http://www.hormelfoods.com/About/Legal/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/~/link.aspx?_id=3FC7ECFD861B4F1AAF7BFD218236F983&_z=z)
I saw the page source of the referer:
It shows this <a href="~/link.aspx?_id=EFFFBF3348524C6ABCD1C2775E7FDA93&_z=z">
How to rectify this?
I have added a counter which keeps track of the number of pdfs parsed.
My parse_item function:
def parse_item(self, response):
    sel = HtmlXPathSelector(response)
    for utype in self.url_types:
        links = []
        # if sel.select('/html/head/link[@type="application/pdf"]/@href').extract():
        #     links += sel.select('/html/head/link[@type="application/pdf"]/@href').extract()
        if sel.xpath('//a[contains(@href, "{0}")]/@href'.format(utype)).extract():
            links += sel.xpath('//a[contains(@href, "{0}")]/@href'.format(utype)).extract()
        if sel.xpath('/html/head/link[@type="application/{0}"]/@href'.format(utype)).extract():
            links += sel.xpath('/html/head/link[@type="application/{0}"]/@href'.format(utype)).extract()
        # if sel.select('/html/head/link[@type="application/x-pdf"]/@href').extract():
        #     links += sel.select('/html/head/link[@type="application/x-pdf"]/@href').extract()
        items = []
        self.cntr += len(links)
        if self.cntr > 60:
            raise CloseSpider('links exceeded')
        for link in links:
            item = CrawlerItem()
            item['main'] = response.url
            base_url = get_base_url(response)
            item['url'] = urljoin(base_url, link)
            company = tldextract.extract(base_url)
            item['source'] = company.domain
            item['type'] = utype.upper()
            yield item

def process_links(self, links):
    for i, w in enumerate(links):
        w.url = w.url.replace("../", "")
        links[i] = w
    return links
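The repeated ~/~/~/ segments come from the site's ASP.NET-style ~/ links being joined onto the current page URL over and over. A hypothetical cleanup in process_links, assuming Python 3 and that ~/ should always resolve against the site root (not the original code), could look like this:

from urllib.parse import urljoin, urlparse

def process_links(self, links):
    # Hypothetical cleanup: resolve the "~/" prefix against the site root
    # instead of letting it pile up on the current page's path.
    for link in links:
        if link.url.startswith('~/') or '/~/' in link.url:
            parsed = urlparse(link.url)
            root = f"{parsed.scheme}://{parsed.netloc}/"
            tail = link.url.split('~/')[-1]  # keep only the part after the last "~/"
            link.url = urljoin(root, tail)
    return links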
