Scrapy iterating over list of elements on page - python

I'm having issues with my scrapy project. I want to extract all adds on the page in a list and then iterate over that list to extract and save data for every add. I'm sure I'm doing something terribly wrong and yet I don't know what. I suspect the problem is with the .extract_first() command but I'm calling that on a single object in the list not the whole response. As of right now the spider is only extracting the first data that conforms to the xpath that it finds on the page.
Here is the code:
class OddajastanovanjeljmestoSpider(scrapy.Spider):
name = 'OddajaStanovanjeLjMesto'
allowed_domains = ['www.nepremicnine.net']
start_urls = ['https://www.nepremicnine.net/oglasi-oddaja/ljubljana-mesto/stanovanje/']
def parse(self, response):
oglasi = response.xpath('//div[#itemprop="item"]')
for oglas in oglasi:
item = NepremicninenetItem()
item['velikost'] = oglas.xpath('//div[#class="main-data"]/span[#class="velikost"]/text()').extract_first(default="NaN")
item['leto'] = oglas.xpath('//div[#class="atributi"]/span[#class="atribut leto"]/strong/text()').extract_first(default="NaN")
item['zemljisce'] = oglas.xpath('//div[#class="atributi"]/span[#class="atribut"][text()="Zemljišče: "]/strong/text()').extract_first(default="NaN")
request = scrapy.Request("https://www.nepremicnine.net" + response.xpath('//div[#itemprop="item"]/h2[#itemprop="name"]/a[#itemprop="url"]/#href').extract_first(), callback=self.parse_item_page)
request.meta['item'] = item
yield request
next_page_url = response.xpath('//div[#id="pagination"]//a[#class="next"]/#href').extract_first()
if next_page_url:
absolute_next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(absolute_next_page_url)
def parse_item_page(self, response):
item = response.meta['item']
item['referencnaStevilka'] = response.xpath('//div[#id="opis"]/div[#class="dsc"][preceding-sibling::div[#class="lbl"][text()="Referenčna št.:"]]/strong/text()').extract_first(default="NaN")
item['tipOglasa'] = response.xpath('//li[#itemprop="itemListElement"]/a[../meta[#content="1"]]/#title').extract_first(default="NaN")
item['cena'] = response.xpath('//div[#class="galerija-container"]/meta[#itemprop="price"]/#content').extract_first(default="NaN")
item['valuta'] = response.xpath('//div[#class="galerija-container"]/meta[#itemprop="priceCurrency"]/#content').extract_first(default="NaN")
item['vrstaNepremicnine'] = response.xpath('//li[#itemprop="itemListElement"]/a[../meta[#content="5"]]/#title').extract_first(default="NaN")
item['tipNepremicnine'] = response.xpath('//li[#itemprop="itemListElement"]/a[../meta[#content="6"]]/#title').extract_first(default="NaN")
item['regija'] = response.xpath('//li[#itemprop="itemListElement"]/a[../meta[#content="2"]]/#title').extract_first(default="NaN")
item['upravnaEnota'] = response.xpath('//li[#itemprop="itemListElement"]/a[../meta[#content="3"]]/#title').extract_first(default="NaN")
item['obcina'] = response.xpath('//li[#itemprop="itemListElement"]/a[../meta[#content="4"]]/#title').extract_first(default="NaN")
item['prodajalec'] = response.xpath('//div[#itemprop="seller"]/meta[#itemprop="name"]/#content').extract_first(default="NaN")
yield item
the parse_item_page method works correctly and returns the appropriate data but the parse method just returns the first data that it sees on the page...

Looks like the issue is with your xpath expressions. It looks like you need relative xpath expression inside the iteration which mean they need to start with a "."
item['velikost'] = oglas.xpath(
'.//div[#class="maindata"]/span[#class="velikost"]/text()'
).extract_first(default="NaN")
item['leto'] = oglas.xpath(
'.//div[#class="atributi"]/span[#class="atribut leto"]/strong/text()'
).extract_first(default="NaN")
If you paste a sample HTML code block I might be able to confirm.

Related

collecting multiple data from multiple requests into one item in scrapy

basically I have a website that contains clothing items. I am starting my spider where I have all the items and I am looping over them one by one, and entering the item by taking the url and accessing the page of it. then, I am trying to get the values of the images (.jpeg url files) and returning them ( each item has multiple colors so I am trying to take all the images of all the colors of this specific item ). the problem is my code right now returns the url of the colors on each line. what I want to do is return all the colors urls of the specific item inside 1 line of the json file and then loop to the next item.
my current code:
import scrapy
class USSpider(scrapy.Spider):
name = 'US'
start_urls = ['https://tr.uspoloassn.com/sadece-online-erkek/?attributes_filterable_product_base_type=T-Shirt']
def parse(self, response):
for j in range(int(response.css('.js-product-list-load').xpath("#page").extract_first()) , int(response.css('.js-product-list-load').xpath("#numpages").extract_first())):
l = 'https://tr.uspoloassn.com/sadece-online-erkek/?attributes_filterable_product_base_type=T-Shirt' + '&page=' + str(j)
yield scrapy.Request(url=l, callback=self.parse2)
def parse2(self,response):
for i in range(len(response.css('a.js-product-images-wrapper'))):
link = 'https://tr.uspoloassn.com' + response.css('a.js-product-images-wrapper')[i].attrib['href']
Url = response.urljoin(link)
yield scrapy.Request(Url, callback=self.parse3)
def parse3(self,response):
colors = list(set(response.xpath('//*[#class="js-variant-area "]').css('ul li').xpath("//a[#class='js-variant ']").xpath(
"#data-value").extract()))
link = response.url
arnold = []
for i in colors:
if i[0].lower() == 'v':
url = link + '?integration_color=' + i
yield scrapy.Request(url,callback=self.parseImage)
def parseImage(self, response):
yield{
'image links': response.css("a.js-product-thumbnail").xpath("#data-image").extract()
}

Exporting Scrapy items with Selenium scraped content

I'm trying to scrape a website using Scrapy and Selenium, and everything works just fine except the "yield item" part of the code.
In the "def parse_product(self, response)" part, I'm using Selenium find_element_by_css_selector to fill a list and then use the "for element in zip(list1, list2, etc)" approach to generate my items. I have also set up a Pipeline to export the result into a csv.
The problem is that although my spider is scraping the objects correctly (I have tested it with some prints along the way), the item creation part is not working and I'm getting an empty csv.
I have tried another approach that works, but is too slow. It consists in defining a Middleware to pass the request through Selenium, load the page source code and return a HtmlResponse. Then, I just simply use the response.css() method to fill the lists, the same approach to generate the items and the same Pipeline to export it as csv.
spider.py
def __init__(self):
#Headless Option
opt = Options()
opt.headless = True
## get the Firefox profile object
prf = FirefoxProfile()
## Disable CSS
prf.set_preference('permissions.default.stylesheet', 2)
## Disable images
prf.set_preference('permissions.default.image', 2)
self.browser = webdriver.Firefox(firefox_options = opt, firefox_profile = prf)
def parse(self, response):
self.browser.get(response.url)
print('Current URL: ' + response.request.url)
# Find the total number of pages
# Go to last page (click on '>>')
self.browser.find_element_by_css_selector('li.ais-pagination__item:nth-child(9) a:nth-child(1)').click()
n = self.browser.find_element_by_css_selector('li.ais-pagination__item:nth-child(7) a:nth-child(1)').get_attribute('text')
pages = int(n.strip())
# Go back to first page (click on '<<')
self.browser.find_element_by_css_selector('li.ais-pagination__item:nth-child(1) a:nth-child(1)').click()
# Scrape product links
pdcts = []
href = []
i=1
while i<=n:
# Append Product Links
atag = self.browser.find_elements_by_css_selector('div[class*="ph-proucts"] a')
for a in atag:
href.append(a.get_attribute('href'))
prd = list(set(href))
for p in prd:
pdcts.append(p)
# Load new page of products (Click '>')
self.browser.find_element_by_css_selector('li.ais-pagination__item:nth-child(8) a:nth-child(1)').click()
i+=1
for link in pdcts:
yield scrapy.Request(url = link, callback = self.parse_product)
def parse_product(self, response):
self.browser.get(response.url)
print("Current URL: " + response.request.url)
#Item Initializator
name = []
#Item Filling
try:
n = self.browser.find_element_by_css_selector('div[class="flex xs12 sm12 md12"] h1').text
name.append(string.capwords(n))
except:
n1 = response.request.url
n = n1.split('products/')[1].replace('-',' ')
name.append(string.capwords(n))
#Item Creation
for i in zip(name):
self.browser.quit()
item = ProjectItem()
item['name'] = i[0]
yield item
The expected result is a csv with the scraped information but I'm getting an empty one instead.
Could anyone help me with this please? I would really appreciate it.

Python Scrapy Encoding utf-8

I have written a program to scrap some data from the web as below.
import scrapy
class JPItem(scrapy.Item):
question_content = scrapy.Field()
best_answer = scrapy.Field()
class JPSpider(scrapy.Spider):
name = "jp"
allowed_domains = ['chiebukuro.yahoo.co.jp']
def start_requests(self):
url = 'https://chiebukuro.yahoo.co.jp/dir/list.php?did=2078297790&flg=1&sort=3&type=list&year=2004&month=1&day=1&page=1'
yield scrapy.Request(url, self.parse)
def parse(self, response):
if str(response.css("div.qa-list small::text").extract()) == '条件に一致する質問はみつかりませんでした。':
for y in range (2004,2007):
for m in range (1,13):
for d in range(1,32):
url = 'https://chiebukuro.yahoo.co.jp/dir/list.php?did=2078297790&flg=1&sort=3&type=list&year='+ str(y) + '&month=' + str(m) + '&day=' + str(d) +'&page=1';
yield scrapy.Request(url, self.parse)
else:
for i in range(0,40):
url = response.xpath('//ul[#id="qalst"]/li/dl/dt/a/#href')[i].extract()
yield scrapy.Request(url, self.parse_info)
next_page = response.css("div.qa-list p.flip a.next::attr(href)").extract_first()
if next_page is not None:
yield scrapy.Request(next_page, self.parse)
def parse_info(self, response):
item = JPItem()
item['question_content'] = "\"" + ''.join(response.css("div.mdPstdQstn div.ptsQes p:not([class])").extract() + response.css("div.mdPstdQstn div.ptsQes p.queTxt::text").extract()).replace("\n","\\n").replace("\r","\\r").replace("\t","\\t").replace("<p>","").replace("</p>","").replace("<br>","") + "\""
item['best_answer'] = "\"" + ''.join(response.css("div.mdPstdBA div.ptsQes p.queTxt::text").extract() + response.css("div.mdPstdBA div.ptsQes p:not([class])").extract()).replace("\n","\\n").replace("\r","\\r").replace("\t","\\t").replace("<p>","").replace("</p>","") + "\""
yield item
I found that there should be a problem with this line
if str(response.css("div.qa-list small::text").extract()) ==
'条件に一致する質問はみつかりませんでした。':
since when I run the program it cannot detect this condition, even if the extracted test should be the equal as stated, it will just skip to the Else condition. I have tried to use .encode("utf-8") but it seems could not solve the issue. Would anyone can help to provide some suggestions on this issue?
Greatly appreciated.
As #paul trmbth pointed out, what you are trying to do here is a compare a list with a string, which is logically incorrect and would always return False. So the options presented are to compare the string with :
response.css("div.qa-list small::text").extract_first() which gives the first extracted element, (here, a string) which is the preferred way since using extract_first() avoids an IndexError and returns None when it doesn’t find any element matching the selection
Since extract() returns a list, just doing response.css("div.qa-list small::text").extract[0] will work and provide the 1st element.
And incase you got a list of more than one strings and you want to take all the text together and do some operation with it, a simple method to turn all of them to a single string is to do ''.join(response.css("div.qa-list small::text"))
In your case using the 1st method is apt, and need not worry about utf-8 conversions as python will handle those internally.

Scrapy (Python): Iterating over 'next' page without multiple functions

I am using Scrapy to grab stock data from Yahoo! Finance.
Sometimes, I need to loop over several pages, 19 in this example , in order to get all of the stock data.
Previously (when I knew there would only be two pages), I would use one function for each page, like so:
def stocks_page_1(self, response):
returns_page1 = []
#Grabs data here...
current_page = response.url
next_page = current_page + "&z=66&y=66"
yield Request(next_page, self.stocks_page_2, meta={'returns_page1': returns_page1})
def stocks_page_2(self, response):
# Grab data again...
Now, instead of writing 19 or more functions, I was wondering if there was a way I could loop through an iteration using one function to grab all data from all pages available for a given stock.
Something like this:
for x in range(30): # 30 was randomly selected
current_page = response.url
# Grabs Data
# Check if there is a 'next' page:
if response.xpath('//td[#align="right"]/a[#rel="next"]').extract() != ' ':
u = x * 66
next_page = current_page + "&z=66&y={0}".format(u)
# Go to the next page somehow within the function???
Updated Code:
Works, but only returns one page of data.
class DmozSpider(CrawlSpider):
name = "dnot"
allowed_domains = ["finance.yahoo.com", "http://eoddata.com/"]
start_urls = ['http://finance.yahoo.com/q?s=CAT']
rules = [
Rule(LinkExtractor(restrict_xpaths='//td[#align="right"]/a[#rel="next"]'),
callback='stocks1',
follow=True),
]
def stocks1(self, response):
returns = []
rows = response.xpath('//table[#class="yfnc_datamodoutline1"]//table/tr')[1:]
for row in rows:
cells = row.xpath('.//td/text()').extract()
try:
values = cells[-1]
try:
float(values)
returns.append(values)
except ValueError:
continue
except ValueError:
continue
unformatted_returns = response.meta.get('returns_pages')
returns = [float(i) for i in returns]
global required_amount_of_returns, counter
if counter == 1 and "CAT" in response.url:
required_amount_of_returns = len(returns)
elif required_amount_of_returns == 0:
raise CloseSpider("'Error with initiating required amount of returns'")
counter += 1
print counter
# Iterator to calculate Rate of return
# ====================================
if data_intervals == "m":
k = 12
elif data_intervals == "w":
k = 4
else:
k = 30
sub_returns_amount = required_amount_of_returns - k
sub_returns = returns[:sub_returns_amount]
rate_of_return = []
if len(returns) == required_amount_of_returns or "CAT" in response.url:
for number in sub_returns:
numerator = number - returns[k]
rate = numerator/returns[k]
if rate == '':
rate = 0
rate_of_return.append(rate)
k += 1
item = Website()
items = []
item['url'] = response.url
item['name'] = response.xpath('//div[#class="title"]/h2/text()').extract()
item['avg_returns'] = numpy.average(rate_of_return)
item['var_returns'] = numpy.cov(rate_of_return)
item['sd_returns'] = numpy.std(rate_of_return)
item['returns'] = returns
item['rate_of_returns'] = rate_of_return
item['exchange'] = response.xpath('//span[#class="rtq_exch"]/text()').extract()
item['ind_sharpe'] = ((numpy.average(rate_of_return) - RFR) / numpy.std(rate_of_return))
items.append(item)
yield item
You see, a parse callback is just a function that takes the response and returns or yields either Items or Requests or both. There is no issue at all with reusing these callbacks, so you can just pass the same callback for every request.
Now, you could pass the current page info using the Request meta but instead, I'd leverage the CrawlSpider to crawl across every page. It's really easy, start generating the Spider with the command line:
scrapy genspider --template crawl finance finance.yahoo.com
Then write it like this:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
Scrapy 1.0 has deprecated the scrapy.contrib namespace for the modules above, but if you're stuck with 0.24, use scrapy.contrib.linkextractors and scrapy.contrib.spiders.
from yfinance.items import YfinanceItem
class FinanceSpider(CrawlSpider):
name = 'finance'
allowed_domains = ['finance.yahoo.com']
start_urls = ['http://finance.yahoo.com/q/hp?s=PWF.TO&a=04&b=19&c=2005&d=04&e=19&f=2010&g=d&z=66&y=132']
rules = (
Rule(LinkExtractor(restrict_css='[rel="next"]'),
callback='parse_items',
follow=True),
)
LinkExtractor will pick up the links in the response to follow, but it can be limited with XPath (or CSS) and regular expressions. See documentation for more.
Rules will follow the links and call the callback on every response. follow=True will keep extracting links on every new response, but it can be limited by depth. See documentation again.
def parse_items(self, response):
for line in response.css('.yfnc_datamodoutline1 table tr')[1:-1]:
yield YfinanceItem(date=line.css('td:first-child::text').extract()[0])
Just yield the Items, since Requests for the next pages will be handled by the CrawlSpider Rules.

How do I request callback on a URL that I first scraped to get?

Just started toying around with scrapy for a bit to help scrape some fantasy basketball stats. My main problem is in my spider - how do I scrape the href attribute of a link and then callback another parser on that url?
I looked into link extractors, and I think this might be my solution but I'm not sure. I've re-read it over and over again, and still am confused on where to start. The following is the code I have so far.
def parse_player(self, response):
player_name = "Steven Adams"
sel = Selector(response)
player_url = sel.xpath("//a[text()='%s']/#href" % player_name).extract()
return Request("http://sports.yahoo.com/'%s'" % player_url, callback = self.parse_curr_stats)
def parse_curr_stats(self, response):
sel = Selector(response)
stats = sel.xpath("//div[#id='mediasportsplayercareerstats']//table[#summary='Player']/tbody/tr[last()-1]")
items =[]
for stat in stats:
item = player_item()
item['fgper'] = stat.xpath("td[#title='Field Goal Percentage']/text()").extract()
item['ftper'] = stat.xpath("td[#title='Free Throw Percentage']/text()").extract()
item['treys'] = stat.xpath("td[#title='3-point Shots Made']/text()").extract()
item['pts'] = stat.xpath("td[#title='Points']/text()").extract()
item['reb'] = stat.xpath("td[#title='Total Rebounds']/text()").extract()
item['ast'] = stat.xpath("td[#title='Assists']/text()").extract()
item['stl'] = stat.xpath("td[#title='Steals']/text()").extract()
item['blk'] = stat.xpath("td[#title='Blocked Shots']/text()").extract()
item['tov'] = stat.xpath("td[#title='Turnovers']/text()").extract()
item['fga'] = stat.xpath("td[#title='Field Goals Attempted']/text()").extract()
item['fgm'] = stat.xpath("td[#title='Field Goals Made']/text()").extract()
item['fta'] = stat.xpath("td[#title='Free Throws Attempted']/text()").extract()
item['ftm'] = stat.xpath("td[#title='Free Throws Made']/text()").extract()
items.append(item)
return items
So as you can see, in the first parse function, you're given a name, and you look for the link on the page that will guide you to their individual page, which is stored in "player_url". How do I then go to that page and run the 2nd parser on it?
I feel as if I am completely glossing over something and if someone could shed some light it would be greatly appreciated!
When you want to send a Request object, just use yield rather than return like this:
def parse_player(self, response):
......
yield Request(......)
If there are many Request objects that you want to send in a single parse method, a best practic is like this:
def parse_player(self, response):
......
res_objs = []
# then add every Request object into 'res_objs' list,
# and in the end of the method, do the following:
for req in res_objs:
yield req
I think when the scrapy spider is running, it will handle requests under the hood like this:
# handle requests
for req_obj in self.parse_play():
# do something with *Request* object
So just remember use yield to send Request objects.

Categories

Resources