I have written a program to scrape some data from the web, as shown below.
import scrapy

class JPItem(scrapy.Item):
    question_content = scrapy.Field()
    best_answer = scrapy.Field()

class JPSpider(scrapy.Spider):
    name = "jp"
    allowed_domains = ['chiebukuro.yahoo.co.jp']

    def start_requests(self):
        url = 'https://chiebukuro.yahoo.co.jp/dir/list.php?did=2078297790&flg=1&sort=3&type=list&year=2004&month=1&day=1&page=1'
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        if str(response.css("div.qa-list small::text").extract()) == '条件に一致する質問はみつかりませんでした。':
            for y in range(2004, 2007):
                for m in range(1, 13):
                    for d in range(1, 32):
                        url = 'https://chiebukuro.yahoo.co.jp/dir/list.php?did=2078297790&flg=1&sort=3&type=list&year=' + str(y) + '&month=' + str(m) + '&day=' + str(d) + '&page=1'
                        yield scrapy.Request(url, self.parse)
        else:
            for i in range(0, 40):
                url = response.xpath('//ul[@id="qalst"]/li/dl/dt/a/@href')[i].extract()
                yield scrapy.Request(url, self.parse_info)
            next_page = response.css("div.qa-list p.flip a.next::attr(href)").extract_first()
            if next_page is not None:
                yield scrapy.Request(next_page, self.parse)

    def parse_info(self, response):
        item = JPItem()
        item['question_content'] = "\"" + ''.join(response.css("div.mdPstdQstn div.ptsQes p:not([class])").extract() + response.css("div.mdPstdQstn div.ptsQes p.queTxt::text").extract()).replace("\n","\\n").replace("\r","\\r").replace("\t","\\t").replace("<p>","").replace("</p>","").replace("<br>","") + "\""
        item['best_answer'] = "\"" + ''.join(response.css("div.mdPstdBA div.ptsQes p.queTxt::text").extract() + response.css("div.mdPstdBA div.ptsQes p:not([class])").extract()).replace("\n","\\n").replace("\r","\\r").replace("\t","\\t").replace("<p>","").replace("</p>","") + "\""
        yield item
I found that there should be a problem with this line:
if str(response.css("div.qa-list small::text").extract()) == '条件に一致する質問はみつかりませんでした。':
since when I run the program it cannot detect this condition: even though the extracted text should be equal to that string, it just skips to the else branch. I have tried using .encode("utf-8"), but it does not seem to solve the issue. Could anyone provide some suggestions on this issue?
Greatly appreciated.
As @paul trmbth pointed out, what you are trying to do here is compare a list with a string, which is logically incorrect and will always return False. So the options presented are to compare the string with:
response.css("div.qa-list small::text").extract_first(), which gives the first extracted element (here, a string). This is the preferred way, since extract_first() avoids an IndexError and returns None when it doesn't find any element matching the selection.
Since extract() returns a list, response.css("div.qa-list small::text").extract()[0] will also work and provide the first element.
In case you get a list of more than one string and want to take all the text together and operate on it, a simple way to turn it into a single string is ''.join(response.css("div.qa-list small::text").extract()).
In your case the first method is apt, and you need not worry about utf-8 conversions, as Python will handle those internally.
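For example, a minimal sketch of the fixed check (the strip() is just an extra precaution I added in case the text node carries surrounding whitespace):

no_results = response.css("div.qa-list small::text").extract_first()
# extract_first() returns a single string (or None), so a plain string comparison works
if no_results is not None and no_results.strip() == '条件に一致する質問はみつかりませんでした。':
    # no questions found for this date combination -> generate the other listing URLs
    ...
else:
    # follow the question links and the next page
    ...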
I am trying to build an item from several parsing functions, because I am getting data from multiple URLs.
I iterate a dictionary (that I built using two for loops), which is why I am using two for loops to get the variables needed to generate the URL.
Then, for every variable, I call the second parse function, passing the needed URL.
This is where I want to call the second parse function from my main parse:
for r in [1, 2]:
    for t in [1, 2]:
        dataName = 'lane' + str(r) + "Player" + str(t) + "Name"
        dataHolder = 'lane' + str(r) + "Player" + str(t)
        nameP = item[dataName]
        print('before parse ==> lane = ' + str(r) + " team = " + str(t))
        urlP = 'https://www.leagueofgraphs.com/summoner/euw/' + nameP + '#championsData-soloqueue'
        yield Request(urlP, callback=self.parsePlayer, meta={'item': item, "player": dataHolder})
I am using those print() calls to see in the output how my code is executing.
Same in my second parsing function, which is as follows:
def parsePlayer(self, response):
    item = response.meta['item']
    player = response.meta['player']
    print('after parse ====> ' + player)
    mmr = response.css('.rank .topRankPercentage::text').extract_first().strip().lower()
    mmrP = player + "Mmr"
    item[mmrP] = mmr
    # yield item after the last iteration
(I know I did not explain every detail in the code, but I think that is not needed to see my problem, not after you see what I am getting from those prints.)
Result I get:
Expected result:
Also, for some reason, every time I run the spider I get a different random order of prints. This is confusing; I think it is something about the yield. I hope someone can help me with that.
Scrapy works asynchronously (as explained clearly in its official documentation), which is why the order of your prints seems random.
Besides the order, the expected output looks exactly the same as the result you get.
If you can explain why the order is relevant, we might be able to answer your question better.
If you want to yield 1 item with data of all 4 players in there, the following structure can be used:
def start_requests(self):
    # prepare the urls & players:
    urls_dataHolders = []
    for r in [1, 2]:
        for t in [1, 2]:
            dataName = 'lane' + str(r) + "Player" + str(t) + "Name"
            dataHolder = 'lane' + str(r) + "Player" + str(t)
            urlP = 'https://www.leagueofgraphs.com/summoner/euw/' + dataName + '#championsData-soloqueue'
            urls_dataHolders.append((urlP, dataHolder))
    # get the first url & dataholder
    url, dataHolder = urls_dataHolders.pop()
    yield Request(url,
                  callback=self.parsePlayer,
                  meta={'urls_dataHolders': urls_dataHolders,
                        'player': dataHolder})

def parsePlayer(self, response):
    item = response.meta.get('item', {})
    urls_dataHolders = response.meta['urls_dataHolders']
    player = response.meta['player']
    mmr = response.css(
        '.rank .topRankPercentage::text').extract_first().strip().lower()
    mmrP = player + "Mmr"
    item[mmrP] = mmr
    try:
        url, dataHolder = urls_dataHolders.pop()
    except IndexError:
        # list of urls is empty, so we yield the item
        yield item
    else:
        # still urls to go through
        yield Request(url,
                      callback=self.parsePlayer,
                      meta={'urls_dataHolders': urls_dataHolders,
                            'item': item,
                            'player': dataHolder})
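Note that with this structure the requests are chained: each parsePlayer call pops the next URL and passes the partially filled item along in meta, so the item is yielded exactly once, after the last player's page has been parsed. The trade-off is that the four player pages are fetched one after another rather than concurrently.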
I use Scrapy to get data from an API call but the server is laggy.
First I scrape one page to get some IDs, and I add them to a list.
After that, I check how many IDs I have, and I start scraping.
The max IDs I can add is 10: event_id=1,2,3,4,5,6,7,8,9,10.
The problem is that, because there are many IDs (around 150), I have to make many requests, and the server responds only after 3-5 seconds. I want to request all the links at once and parse them later, if that is possible.
import json

import scrapy

match = "https://api.---.com/v1/?token=???&event_id&event_id="
id_list = []

class ApiSpider(scrapy.Spider):
    name = 'api'
    allowed_domains = ['api.---.com']
    start_urls = ['https://api.---.com/ids/&token=???']

    def parse(self, response):
        data = json.loads(response.body)
        results = data['results']
        for result in results:
            id_list.append(result['id'])
        yield from self.scrape_start()

    def scrape_start(self):
        if len(id_list) >= 10:
            qq = (match + id_list[0] + "," + id_list[1] + "," + id_list[2] + "," + id_list[3] + ","
                  + id_list[4] + "," + id_list[5] + "," + id_list[6] + "," + id_list[7] + ","
                  + id_list[8] + "," + id_list[9])
            yield scrapy.Request(qq, callback=self.parse_product)
            del id_list[0:10]
        elif len(id_list) == 9:
            ...

    def parse_product(self, response):
        data = json.loads(response.body)
        results = data['results']
        for result in results:
            ...
Try changing CONCURRENT_REQUESTS, which defaults to 16, to a higher number.
As per the Scrapy docs:
The maximum number of concurrent (i.e. simultaneous) requests that will be performed by the Scrapy downloader.
Note that in some cases this results in hardware bottlenecks, so try not to increase it by a lot. I'd recommend gradually increasing this value and observing system stats (CPU/network).
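For reference, this is roughly what the relevant part of settings.py could look like; the values are only a starting point to tune, and since all of your requests go to a single API host, the per-domain limit is worth raising alongside the global one:

# settings.py -- example values only, increase gradually while watching CPU/network
CONCURRENT_REQUESTS = 32             # global cap; the default is 16
CONCURRENT_REQUESTS_PER_DOMAIN = 32  # per-domain cap; the default is 8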
I'm having issues with my Scrapy project. I want to extract all the ads on the page into a list and then iterate over that list to extract and save data for every ad. I'm sure I'm doing something terribly wrong, yet I don't know what. I suspect the problem is with the .extract_first() command, but I'm calling that on a single object in the list, not on the whole response. As of right now the spider only extracts the first piece of data matching the XPath that it finds on the page.
Here is the code:
class OddajastanovanjeljmestoSpider(scrapy.Spider):
    name = 'OddajaStanovanjeLjMesto'
    allowed_domains = ['www.nepremicnine.net']
    start_urls = ['https://www.nepremicnine.net/oglasi-oddaja/ljubljana-mesto/stanovanje/']

    def parse(self, response):
        oglasi = response.xpath('//div[@itemprop="item"]')
        for oglas in oglasi:
            item = NepremicninenetItem()
            item['velikost'] = oglas.xpath('//div[@class="main-data"]/span[@class="velikost"]/text()').extract_first(default="NaN")
            item['leto'] = oglas.xpath('//div[@class="atributi"]/span[@class="atribut leto"]/strong/text()').extract_first(default="NaN")
            item['zemljisce'] = oglas.xpath('//div[@class="atributi"]/span[@class="atribut"][text()="Zemljišče: "]/strong/text()').extract_first(default="NaN")
            request = scrapy.Request("https://www.nepremicnine.net" + response.xpath('//div[@itemprop="item"]/h2[@itemprop="name"]/a[@itemprop="url"]/@href').extract_first(), callback=self.parse_item_page)
            request.meta['item'] = item
            yield request
        next_page_url = response.xpath('//div[@id="pagination"]//a[@class="next"]/@href').extract_first()
        if next_page_url:
            absolute_next_page_url = response.urljoin(next_page_url)
            yield scrapy.Request(absolute_next_page_url)

    def parse_item_page(self, response):
        item = response.meta['item']
        item['referencnaStevilka'] = response.xpath('//div[@id="opis"]/div[@class="dsc"][preceding-sibling::div[@class="lbl"][text()="Referenčna št.:"]]/strong/text()').extract_first(default="NaN")
        item['tipOglasa'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="1"]]/@title').extract_first(default="NaN")
        item['cena'] = response.xpath('//div[@class="galerija-container"]/meta[@itemprop="price"]/@content').extract_first(default="NaN")
        item['valuta'] = response.xpath('//div[@class="galerija-container"]/meta[@itemprop="priceCurrency"]/@content').extract_first(default="NaN")
        item['vrstaNepremicnine'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="5"]]/@title').extract_first(default="NaN")
        item['tipNepremicnine'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="6"]]/@title').extract_first(default="NaN")
        item['regija'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="2"]]/@title').extract_first(default="NaN")
        item['upravnaEnota'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="3"]]/@title').extract_first(default="NaN")
        item['obcina'] = response.xpath('//li[@itemprop="itemListElement"]/a[../meta[@content="4"]]/@title').extract_first(default="NaN")
        item['prodajalec'] = response.xpath('//div[@itemprop="seller"]/meta[@itemprop="name"]/@content').extract_first(default="NaN")
        yield item
The parse_item_page method works correctly and returns the appropriate data, but the parse method just returns the first piece of data that it sees on the page...
Looks like the issue is with your XPath expressions. You need relative XPath expressions inside the iteration, which means they need to start with a "." — an expression starting with // always searches from the document root, even when called on a sub-selector like oglas, so every iteration matches the same first element on the page.
item['velikost'] = oglas.xpath(
    './/div[@class="main-data"]/span[@class="velikost"]/text()'
).extract_first(default="NaN")
item['leto'] = oglas.xpath(
    './/div[@class="atributi"]/span[@class="atribut leto"]/strong/text()'
).extract_first(default="NaN")
If you paste a sample HTML code block I might be able to confirm.
I'm getting the following traceback but unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
    name = 'missle_spider_bio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']
This is the part giving me issues (I believe):
def parse(self, response):
    filename = response.url.split('/')[-1]
    table = response.xpath('///div/table[2]/tbody')
    rows = table.xpath('//tr')
    row = rows[2]
    row.xpath('td//text()')[0].extract()
    wdata = {}
    for row in response.xpath('//*[@class="wikitable"]//tbody//tr'):
        for link in response.xpath('//a/@href'):
            link = link.extract()
            if link.strip() != '':
                yield Request(link, callback=self.parse)
                # wdata.append(link)
            else:
                yield None
        # wdata = {}
        # wdata['link'] = BASE_URL + row.xpath('a/@href').extract()  # [0]
        wdata['link'] = BASE_URL + link
        request = scrapy.Request(wdata['link'],
                                 callback=self.get_mini_bio, dont_filter=True)
        request.meta['item'] = MissleItem(**wdata)
        yield request
Here is the second part of the code:
def get_mini_bio(self, response):
    BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
    item = response.meta['item']
    item['image_urls'] = []
    img_src = response.xpath('//table[contains(@class, "infobox")]//img/@src')
    if img_src:
        item['image_urls'] = ['http:' + img_src[0].extract()]
    mini_bio = ''
    paras = response.xpath('//*[@id="mw-content-text"]/p[text() or normalize-space(.)=""]').extract()
    for p in paras:
        if p == '<p></p>':
            break
        mini_bio += p
    mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
    mini_bio = mini_bio.replace('href="#', item['link'] + '#')
    item['mini_bio'] = mini_bio
    yield item
I tried refactoring, but am now getting:
ValueError: Missing scheme in request url: #mw-head
Any help would be immensely appreciated.
Looks like you were on the right track with the commented out [0].
xpath().extract()  # returns a list of strings
You need to select the string with [0]:
row.xpath('a/@href').extract()
That expression evaluates to a list, NOT a string. When you pass the URL to the Request object, Scrapy expects a string, not a list.
To fix this, you have a few options:
You can use LinkExtractors which will allow you to search a page for links and automatically create scrapy request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
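As a rough sketch of how option 1 could look (this is not your exact spider; the spider name and the restriction to en.wikipedia.org are my assumptions, and the callback is just parse for simplicity):

from scrapy.linkextractors import LinkExtractor
import scrapy

class WikiLinksSpider(scrapy.Spider):
    name = 'wiki_links'  # illustrative name, not from your code
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']  # same placeholder as in your code

    def parse(self, response):
        # extract_links() returns Link objects whose .url is absolute (relative hrefs are
        # joined against the response URL), so fragments like "#mw-head" no longer cause
        # the "Missing scheme" error
        extractor = LinkExtractor(allow_domains=['en.wikipedia.org'])
        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)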
OR
You could run a for loop and go through each of the links:
from scrapy import Request

for link in response.xpath('//a/@href'):
    link = link.extract()
    if link.strip() != '':
        yield Request(link, callback=self.parse)
    else:
        yield None
You can add whatever string filters you want to that code
OR
If you just want the first link, you can use .extract_first() instead of .extract()
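For example:

first_link = response.xpath('//a/@href').extract_first()  # a single string, or None if nothing matches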
I am new to using Scrapy and am trying to get all the URLs of the listings on the page using XPath.
The first XPath works:
sel.xpath('//[contains(@class, "attraction_element")]')
but the second XPath is giving an error:
get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href')
What is wrong and how can we fix it?
Scrapy Code
def clean_parsed_string(string):
    if len(string) > 0:
        ascii_string = string
        if is_ascii(ascii_string) == False:
            ascii_string = unicodedata.normalize('NFKD', ascii_string).encode('ascii', 'ignore')
        return str(ascii_string)
    else:
        return None

def get_parsed_string(selector, xpath):
    return_string = ''
    extracted_list = selector.xpath(xpath).extract()
    if len(extracted_list) > 0:
        raw_string = extracted_list[0].strip()
        if raw_string is not None:
            return_string = htmlparser.unescape(raw_string)
    return return_string

class TripAdvisorSpider(Spider):
    name = 'tripadvisor'
    allowed_domains = ["tripadvisor.com"]
    base_uri = "http://www.tripadvisor.com"
    start_urls = [
        base_uri + '/Attractions-g155032-Activities-c47-t163-Montreal_Quebec.html'
    ]

    # Entry point for BaseSpider
    def parse(self, response):
        tripadvisor_items = []
        sel = Selector(response)
        snode_attractions = sel.xpath('//[contains(@class, "attraction_element")]')
        # Build item index
        for snode_attraction in snode_attractions:
            print clean_parsed_string(get_parsed_string(snode_attraction, '//[@class="property_title"]/a/@href'))
Neither is a valid XPath expression; you need to add a tag name after the //. You can also use a wildcard *:
snode_attractions = sel.xpath('//*[contains(@class, "attraction_element")]')
Note that, aside from that, your second XPath expression, which is used in a loop, has to be context-specific and start with a dot:
# Build item index
for snode_attraction in snode_attractions:
    print clean_parsed_string(get_parsed_string(snode_attraction, './/*[@class="property_title"]/a/@href'))
Also note that you don't need to instantiate a Selector object and can use the response.xpath() shortcut directly.
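For instance, the same selection without the explicit Selector:

snode_attractions = response.xpath('//*[contains(@class, "attraction_element")]')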
Note that a more concise and, arguably, more readable version of the same logic would be to use CSS selectors:
snode_attractions = response.css('.attraction_element')
for snode_attraction in snode_attractions:
    print snode_attraction.css('.property_title > a::attr("href")').extract_first()