Using XPath with Scrapy - python

I am new to Scrapy and am trying to get all the URLs of the listings on the page using XPath.
The first xpath works:
sel.xpath('//[contains(@class, "attraction_element")]')
but the second xpath gives an error:
get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href')
What is wrong, and how can we fix it?
Scrapy Code
def clean_parsed_string(string):
    if len(string) > 0:
        ascii_string = string
        if is_ascii(ascii_string) == False:
            ascii_string = unicodedata.normalize('NFKD', ascii_string).encode('ascii', 'ignore')
        return str(ascii_string)
    else:
        return None
def get_parsed_string(selector, xpath):
    return_string = ''
    extracted_list = selector.xpath(xpath).extract()
    if len(extracted_list) > 0:
        raw_string = extracted_list[0].strip()
        if raw_string is not None:
            return_string = htmlparser.unescape(raw_string)
    return return_string
class TripAdvisorSpider(Spider):
    name = 'tripadvisor'
    allowed_domains = ["tripadvisor.com"]
    base_uri = "http://www.tripadvisor.com"
    start_urls = [
        base_uri + '/Attractions-g155032-Activities-c47-t163-Montreal_Quebec.html'
    ]

    # Entry point for BaseSpider
    def parse(self, response):
        tripadvisor_items = []
        sel = Selector(response)
        snode_attractions = sel.xpath('//[contains(@class, "attraction_element")]')

        # Build item index
        for snode_attraction in snode_attractions:
            print clean_parsed_string(get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href'))

Both are not valid XPath expressions; you need to add the tag name after the //. You can also use a wildcard *:
snode_attractions = sel.xpath('//*[contains(@class, "attraction_element")]')
Note that, aside from that, your second XPath expression, which is used in a loop, has to be context-specific and start with a dot:
# Build item index
for snode_attraction in snode_attractions:
    print clean_parsed_string(get_parsed_string(snode_attraction, './/*[@class="property_title"]/a/@href'))
Also note that you don't need to instantiate a Selector object and can use the response.xpath() shortcut directly.
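For instance, a minimal sketch of the same selection using the shortcut (same response object as above):

# Same selection without constructing a Selector explicitly
snode_attractions = response.xpath('//*[contains(@class, "attraction_element")]')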
A more concise and, arguably, more readable version of the same logic would use CSS selectors:
snode_attractions = response.css('.attraction_element')
for snode_attraction in snode_attractions:
    print snode_attraction.css('.property_title > a::attr(href)').extract_first()

Related

How to scrape website with multiple pages using scrapy

I am trying to scrape this website (which has multiple pages) using scrapy. The problem is that I can't find the next page URL.
Do you have an idea of how to scrape a website with multiple pages (with scrapy), or how to solve the error I'm getting with my code?
I tried the code below but it's not working:
class AbcdspiderSpider(scrapy.Spider):
    """
    Class docstring
    """
    name = 'abcdspider'
    allowed_domains = ['abcd-terroir.smartrezo.com']
    alphabet = list(string.ascii_lowercase)
    url = "https://abcd-terroir.smartrezo.com/n31-france/annuaireABCD.html?page=1&spe=1&anIDS=31&search="
    start_urls = [url + letter for letter in alphabet]
    main_url = "https://abcd-terroir.smartrezo.com/n31-france/"
    crawl_datetime = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    start_time = datetime.datetime.now()

    def parse(self, response):
        self.crawler.stats.set_value("start_time", self.start_time)
        try:
            page = response.xpath('//div[@class="pageStuff"]/span/text()').get()
            page_max = get_num_page(page)
            for index in range(page_max):
                producer_list = response.xpath('//div[@class="clearfix encart_ann"]/@onclick').getall()
                for producer in producer_list:
                    link_producer = self.main_url + producer
                    yield scrapy.Request(url=link_producer, callback=self.parse_details)
                next_page_url = "/annuaireABCD.html?page={}&spe=1&anIDS=31&search=".format(index)
                if next_page_url is not None:
                    yield scrapy.Request(response.urljoin(self.main_url + next_page_url))
        except Exception as e:
            self.crawler.stats.set_value("error", e.args)
I am getting this error:
'error': ('range() integer end argument expected, got unicode.',)
The error is here:
page = response.xpath('//div[@class="pageStuff"]/span/text()').get()
page_max = get_num_page(page)
The range function expects an integer value (1, 2, 3, 4, etc.), not a unicode string ('Page 1 / 403').
My proposal for the range error is:
page = response.xpath('//div[@class="pageStuff"]/span/text()').get().split('/ ')[1]
for index in range(int(page)):
    # your actions
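If the 'Page 1 / 403' label varies in spacing, a slightly more defensive sketch using a regular expression (the pageStuff selector is taken from the question; everything else here is illustrative):

import re

page_label = response.xpath('//div[@class="pageStuff"]/span/text()').get() or ''
match = re.search(r'/\s*(\d+)', page_label)  # pull the number after the slash in 'Page 1 / 403'
page_max = int(match.group(1)) if match else 1
for index in range(1, page_max + 1):
    # your actions for each page index
    ...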

how to yield a parsed item from one link with other parsed items from other links in the same item list

The problem is that I've been iterating over a list of places to scrape the latitude, longitude, and elevation. The thing is, when I get what I scraped back, I have no way to link it with my current df, since the names that I iterated over may have been modified or skipped.
I've managed to get the name of what I searched for, but since it's parsed on a different page than the rest of the items, it doesn't line up properly.
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def parse(self, response):
        items = latlonglocItem()
        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                yield response.follow(href, self.parse_distancesto)
            else:
                pass
        yield items

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
# output
appellation  base_name        elevation  latitude  longitude
             Chalone, USA
             Santa Cruz, USA  56.81      35        9.23
What is happening is that I parse what I searched for, then the spider goes inside a link and parses the rest of the information. However, on my dataframe the name of what I searched for ends up completely unattached from the rest of the items, and even then it is hard to find the match. I wish to pass the info to the other function so it yields all the items together.
This may work. I will comment both what I am doing and a little bit of your code, so you have an understanding of what I am doing.
import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):  # latlonglocSpider is a child class of scrapy.Spider
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self):  # Constructor for our class
        # Since we wrote our own constructor we need to call the parent's constructor
        scrapy.Spider.__init__(self)
        self.base_name = None  # Here is the base_name we can now use class-wide

    def parse(self, response):
        items = latlonglocItem()
        items['base_name'] = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
        self.base_name = items['base_name']  # Let's store the base_name in the class
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                yield response.follow(href, self.parse_distancesto)
            else:
                pass
        yield items

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            # If for some reason self.base_name is never assigned in
            # parse() then we want to use an empty string instead.
            # The following syntax means: use self.base_name unless it is
            # None or empty, in which case just use an empty string.
            base_name = self.base_name or ""
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
An updated version, with base_name assigned inside the loop and attached to the item in parse_distancesto:

import scrapy
import pandas as pd
from ..items import latlonglocItem

df = pd.read_csv('wine_df_final.csv')
df = df[pd.notnull(df.real_place)]
real_place = list(set(df.real_place))

class latlonglocSpider(scrapy.Spider):  # latlonglocSpider is a child class of scrapy.Spider
    name = 'latlonglocs'
    start_urls = []
    for place in real_place:
        baseurl = place.replace(',', '').replace(' ', '+')
        cleaned_href = f'http://www.google.com/search?q={baseurl}+coordinates+latitude+longitude+distancesto'
        start_urls.append(cleaned_href)

    def __init__(self):  # Constructor for our class
        # Since we wrote our own constructor we need to call the parent's constructor
        scrapy.Spider.__init__(self)
        self.base_name = None  # Here is the base_name we can now use class-wide

    def parse(self, response):
        for href in response.xpath('//*[@id="ires"]/ol/div/h3/a/@href').getall():
            if href.startswith('/url?q=https://www.distancesto'):
                self.base_name = response.xpath('string(/html/head/title)').get().split(' coordinates')[0]
                yield response.follow(href, self.parse_distancesto)
            else:
                pass

    def parse_distancesto(self, response):
        items = latlonglocItem()
        try:
            # If for some reason self.base_name is never assigned in
            # parse() then we want to use an empty string instead.
            items['base_name'] = self.base_name or ""
            items['appellation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[2]/p/strong)').get()
            items['latitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[1]/td)').get()
            items['longitude'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[2]/td)').get()
            items['elevation'] = response.xpath('string(/html/body/div[3]/div/div[2]/div[3]/div[3]/table/tbody/tr[10]/td)').get()
            yield items
        except Exception:
            pass
Thanks to Error - Syntactical Remorse. Concurrent requests had to be set to 1 for it to work, and base_name had to be placed inside the loop.
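For reference, a minimal sketch of pinning the concurrency from within the spider itself rather than in settings.py (CONCURRENT_REQUESTS is a standard Scrapy setting):

class latlonglocSpider(scrapy.Spider):
    name = 'latlonglocs'
    # Process one request at a time so the base_name stored on the spider
    # still belongs to the follow-up request being parsed
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }

An alternative that avoids shared spider state (and the concurrency limit) is to pass the name along with each request via meta, which Scrapy supports on response.follow:

yield response.follow(href, self.parse_distancesto, meta={'base_name': items['base_name']})
# ...and in parse_distancesto:
items['base_name'] = response.meta.get('base_name', '')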

Scrapy iterating over list of elements on page

I'm having issues with my scrapy project. I want to extract all the ads on the page into a list and then iterate over that list to extract and save data for every ad. I'm sure I'm doing something terribly wrong, and yet I don't know what. I suspect the problem is with the .extract_first() command, but I'm calling that on a single object in the list, not the whole response. As of right now, the spider only extracts the first data that conforms to the xpath that it finds on the page.
Here is the code:
class OddajastanovanjeljmestoSpider(scrapy.Spider):
    name = 'OddajaStanovanjeLjMesto'
    allowed_domains = ['www.nepremicnine.net']
    start_urls = ['https://www.nepremicnine.net/oglasi-oddaja/ljubljana-mesto/stanovanje/']

    def parse(self, response):
        oglasi = response.xpath('//div[@itemprop="item"]')
        for oglas in oglasi:
            item = NepremicninenetItem()
            item['velikost'] = oglas.xpath('//div[@class="main-data"]/span[@class="velikost"]/text()').extract_first(default="NaN")
            item['leto'] = oglas.xpath('//div[@class="atributi"]/span[@class="atribut leto"]/strong/text()').extract_first(default="NaN")
            item['zemljisce'] = oglas.xpath('//div[@class="atributi"]/span[@class="atribut"][text()="Zemljišče: "]/strong/text()').extract_first(default="NaN")
            request = scrapy.Request("https://www.nepremicnine.net" + response.xpath('//div[@itemprop="item"]/h2[@itemprop="name"]/a[@itemprop="url"]/@href').extract_first(), callback=self.parse_item_page)
            request.meta['item'] = item
            yield request
        next_page_url = response.xpath('//div[@id="pagination"]//a[@class="next"]/@href').extract_first()
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_item_page(self, response):
        item = response.meta['item']
        item['referencnaStevilka'] = response.xpath('//div[@id="opis"]/div[@class="dsc"][preceding-sibling::div[@class="lbl"][text()="Referenčna št.:"]]/strong/text()').extract_first(default="NaN")
        item['tipOglasa'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="1"]]/@title').extract_first(default="NaN")
        item['cena'] = response.xpath('//div[@class="galerija-container"]/meta[@itemprop="price"]/@content').extract_first(default="NaN")
        item['valuta'] = response.xpath('//div[@class="galerija-container"]/meta[@itemprop="priceCurrency"]/@content').extract_first(default="NaN")
        item['vrstaNepremicnine'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="5"]]/@title').extract_first(default="NaN")
        item['tipNepremicnine'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="6"]]/@title').extract_first(default="NaN")
        item['regija'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="2"]]/@title').extract_first(default="NaN")
        item['upravnaEnota'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="3"]]/@title').extract_first(default="NaN")
        item['obcina'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="4"]]/@title').extract_first(default="NaN")
        item['prodajalec'] = response.xpath('//div[@itemprop="seller"]/meta[@itemprop="name"]/@content').extract_first(default="NaN")
        yield item
The parse_item_page method works correctly and returns the appropriate data, but the parse method just returns the first data that it sees on the page...
Looks like the issue is with your xpath expressions. You need relative xpath expressions inside the iteration, which means they need to start with a ".":
item['velikost'] = oglas.xpath(
    './/div[@class="main-data"]/span[@class="velikost"]/text()'
).extract_first(default="NaN")
item['leto'] = oglas.xpath(
    './/div[@class="atributi"]/span[@class="atribut leto"]/strong/text()'
).extract_first(default="NaN")
If you paste a sample HTML code block I might be able to confirm.
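On the same point, the detail-page request in parse() likely has the same issue: it selects the href from the whole response instead of the current oglas node, so every item gets the first ad's URL. A sketch of the fix, reusing the markup from the question:

# Take the detail link from the current oglas node, not the whole response
href = oglas.xpath('.//h2[@itemprop="name"]/a[@itemprop="url"]/@href').extract_first()
if href:
    request = scrapy.Request("https://www.nepremicnine.net" + href, callback=self.parse_item_page)
    request.meta['item'] = item
    yield request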

Scrapy - ValueError: Missing scheme in request url: #mw-head

I'm getting the following traceback but unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
    name = 'missle_spider_bio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']
This is the part giving me issues (I believe):
def parse(self, response):
    filename = response.url.split('/')[-1]
    table = response.xpath('///div/table[2]/tbody')
    rows = table.xpath('//tr')
    row = rows[2]
    row.xpath('td//text()')[0].extract()
    wdata = {}
    for row in response.xpath('//*[@class="wikitable"]//tbody//tr'):
        for link in response.xpath('//a/@href'):
            link = link.extract()
            if (link.strip() != ''):
                yield Request(link, callback=self.parse)
                # wdata.append(link)
            else:
                yield None
            # wdata = {}
            # wdata['link'] = BASE_URL +
            # row.xpath('a/@href').extract()  # [0]
            wdata['link'] = BASE_URL + link
            request = scrapy.Request(wdata['link'], callback=self.get_mini_bio, dont_filter=True)
            request.meta['item'] = MissleItem(**wdata)
            yield request
Here is the second part of the code:
def get_mini_bio(self, response):
    BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
    item = response.meta['item']
    item['image_urls'] = []
    img_src = response.xpath('//table[contains(@class, "infobox")]//img/@src')
    if img_src:
        item['image_urls'] = ['http:' + img_src[0].extract()]
    mini_bio = ''
    paras = response.xpath('//*[@id="mw-content-text"]/p[text() or normalize-space(.)=""]').extract()
    for p in paras:
        if p == '<p></p>':
            break
        mini_bio += p
    mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
    mini_bio = mini_bio.replace('href="#', item['link'] + '#')
    item['mini_bio'] = mini_bio
    yield item
I tried refactoring but am now getting:
ValueError: Missing scheme in request url: #mw-head
Any help would be immensely appreciated.
Looks like you were on the right track with the commented-out [0].
xpath().extract()  # returns a list of strings
You need to select the string with [0]:
row.xpath('a/@href').extract()
That expression evaluates to a list, NOT a string. When you pass the URL to the request object, scrapy expects a string, not a list.
To fix this, you have a few options:
You can use LinkExtractors, which will allow you to search a page for links and automatically create scrapy Request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
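For instance, a minimal sketch of that approach as a replacement parse method (the allow pattern is illustrative, not taken from the original code):

from scrapy.linkextractors import LinkExtractor

def parse(self, response):
    # extract_links() returns Link objects whose .url is already absolute,
    # so a bare fragment like "#mw-head" can no longer cause the
    # missing-scheme error
    for link in LinkExtractor(allow=r'/wiki/').extract_links(response):
        yield scrapy.Request(link.url, callback=self.get_mini_bio)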
OR
You could run a for loop and go through each of the links:
from scrapy import Request

for link in response.xpath('//a/@href'):
    link = link.extract()
    if link.strip() != '':
        yield Request(link, callback=self.parse)
    else:
        yield None
You can add whatever string filters you want to that code
OR
If you just want the first link, you can use .extract_first() instead of .extract():
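A short sketch of that variant, combined with response.urljoin() so a relative href still becomes a valid request URL:

first_link = response.xpath('//a/@href').extract_first()
if first_link:
    # urljoin resolves relative URLs (and fragments) against the current page
    yield Request(response.urljoin(first_link), callback=self.parse)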

Python Scrapy Encoding utf-8

I have written a program to scrape some data from the web, as below.
import scrapy

class JPItem(scrapy.Item):
    question_content = scrapy.Field()
    best_answer = scrapy.Field()

class JPSpider(scrapy.Spider):
    name = "jp"
    allowed_domains = ['chiebukuro.yahoo.co.jp']

    def start_requests(self):
        url = 'https://chiebukuro.yahoo.co.jp/dir/list.php?did=2078297790&flg=1&sort=3&type=list&year=2004&month=1&day=1&page=1'
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # '条件に一致する質問はみつかりませんでした。' means "No questions matching the criteria were found."
        if str(response.css("div.qa-list small::text").extract()) == '条件に一致する質問はみつかりませんでした。':
            for y in range(2004, 2007):
                for m in range(1, 13):
                    for d in range(1, 32):
                        url = 'https://chiebukuro.yahoo.co.jp/dir/list.php?did=2078297790&flg=1&sort=3&type=list&year=' + str(y) + '&month=' + str(m) + '&day=' + str(d) + '&page=1'
                        yield scrapy.Request(url, self.parse)
        else:
            for i in range(0, 40):
                url = response.xpath('//ul[@id="qalst"]/li/dl/dt/a/@href')[i].extract()
                yield scrapy.Request(url, self.parse_info)
            next_page = response.css("div.qa-list p.flip a.next::attr(href)").extract_first()
            if next_page is not None:
                yield scrapy.Request(next_page, self.parse)

    def parse_info(self, response):
        item = JPItem()
        item['question_content'] = "\"" + ''.join(response.css("div.mdPstdQstn div.ptsQes p:not([class])").extract() + response.css("div.mdPstdQstn div.ptsQes p.queTxt::text").extract()).replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t").replace("<p>", "").replace("</p>", "").replace("<br>", "") + "\""
        item['best_answer'] = "\"" + ''.join(response.css("div.mdPstdBA div.ptsQes p.queTxt::text").extract() + response.css("div.mdPstdBA div.ptsQes p:not([class])").extract()).replace("\n", "\\n").replace("\r", "\\r").replace("\t", "\\t").replace("<p>", "").replace("</p>", "") + "\""
        yield item
I found that there should be a problem with this line:
if str(response.css("div.qa-list small::text").extract()) == '条件に一致する質問はみつかりませんでした。':
When I run the program it cannot detect this condition: even though the extracted text should be equal as stated, it just skips to the else branch. I have tried to use .encode("utf-8") but it does not seem to solve the issue. Would anyone be able to provide some suggestions?
Greatly appreciated.
As @paul trmbth pointed out, what you are trying to do here is compare a list with a string, which is logically incorrect and will always return False. So the options presented are to compare the string with:
response.css("div.qa-list small::text").extract_first(), which gives the first extracted element (here, a string). This is the preferred way, since extract_first() avoids an IndexError and returns None when it doesn't find any element matching the selection.
Since extract() returns a list, doing response.css("div.qa-list small::text").extract()[0] will also work and provide the first element.
In case you get a list of more than one string and want to take all the text together and operate on it, a simple way to turn them into a single string is ''.join(response.css("div.qa-list small::text").extract()).
In your case, the first method is apt, and you need not worry about utf-8 conversions, as Python will handle those internally.
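Put together, a minimal sketch of the corrected comparison (the selector and the Japanese "no matching questions" message are taken from the question):

no_result_msg = response.css("div.qa-list small::text").extract_first()
# extract_first() returns a single string (or None), so this is now
# a string-to-string comparison instead of list-to-string
if no_result_msg == '条件に一致する質問はみつかりませんでした。':
    ...  # generate the date-based URLs as before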
