Scrapy - ValueError: Missing scheme in request url: #mw-head - python

I'm getting the following traceback but unsure how to refactor.
ValueError: Missing scheme in request url: #mw-head
Full code:
class MissleSpiderBio(scrapy.Spider):
    name = 'missle_spider_bio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/...']
This is the part giving me issues (I believe):
    def parse(self, response):
        filename = response.url.split('/')[-1]
        table = response.xpath('///div/table[2]/tbody')
        rows = table.xpath('//tr')
        row = rows[2]
        row.xpath('td//text()')[0].extract()
        wdata = {}
        for row in response.xpath('//*[@class="wikitable"]//tbody//tr'):
            for link in response.xpath('//a/@href'):
                link = link.extract()
                if link.strip() != '':
                    yield Request(link, callback=self.parse)
                    #wdata.append(link)
                else:
                    yield None
                #wdata = {}
                #wdata['link'] = BASE_URL +
                #row.xpath('a/@href').extract() #[0]
                wdata['link'] = BASE_URL + link
                request = scrapy.Request(wdata['link'],
                                         callback=self.get_mini_bio, dont_filter=True)
                request.meta['item'] = MissleItem(**wdata)
                yield request
Here is the second part of the code:
    def get_mini_bio(self, response):
        BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
        item = response.meta['item']
        item['image_urls'] = []
        img_src = response.xpath('//table[contains(@class, "infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        paras = response.xpath('//*[@id="mw-content-text"]/p[text() or normalize-space(.)=""]').extract()
        for p in paras:
            if p == '<p></p>':
                break
            mini_bio += p
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        mini_bio = mini_bio.replace('href="#', item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item
I tried refactoring, but am now getting:
ValueError: Missing scheme in request url: #mw-head
Any help would be immensely appreciated.

Looks like you were on the right track with the commented-out [0].
xpath().extract() returns a list of strings, so you need to select a single string with [0].

row.xpath('a/@href').extract()

That expression evaluates to a list, NOT a string. When you pass the URL to the Request object, Scrapy expects a string, not a list.
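For example (a hypothetical selector, just to show the shapes involved):

    response.xpath('//a/@href').extract()      # ['#mw-head', '/wiki/Foo', ...]  -> a list of strings
    response.xpath('//a/@href').extract()[0]   # '#mw-head'                      -> a single string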
To fix this, you have a few options:
You can use LinkExtractors, which will allow you to search a page for links and automatically create Scrapy Request objects for those links:
https://doc.scrapy.org/en/latest/topics/link-extractors.html
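A minimal sketch of that approach, assuming the spider above (the restrict_xpaths value and the callback wiring are illustrative, not tested against the page):

    from scrapy.linkextractors import LinkExtractor

    def parse(self, response):
        # LinkExtractor resolves relative hrefs (like "#mw-head") into absolute URLs
        extractor = LinkExtractor(allow_domains=['en.wikipedia.org'],
                                  restrict_xpaths=['//table[@class="wikitable"]'])
        for link in extractor.extract_links(response):
            # attach your item via meta here, as in the original code, if needed
            yield scrapy.Request(link.url, callback=self.get_mini_bio)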
OR
You could run a for loop and go through each of the links:
from scrapy import Request

for link in response.xpath('//a/@href'):
    link = link.extract()
    if link.strip() != '':
        # urljoin resolves relative hrefs such as "#mw-head" against the page URL
        yield Request(response.urljoin(link), callback=self.parse)
    else:
        yield None
You can add whatever string filters you want to that code
OR
If you just want the first link, you can use .extract_first() instead of .extract()
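For example, applied to the row loop above (a sketch only; the urljoin call is an addition of mine to turn relative hrefs into absolute URLs):

    link = row.xpath('a/@href').extract_first()   # a single string (or None), not a list
    if link:
        wdata['link'] = response.urljoin(link)    # e.g. '/wiki/Foo' -> 'https://en.wikipedia.org/wiki/Foo'
        yield scrapy.Request(wdata['link'], callback=self.get_mini_bio,
                             meta={'item': MissleItem(**wdata)}, dont_filter=True)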

Related

collecting multiple data from multiple requests into one item in scrapy

Basically, I have a website that contains clothing items. I start my spider on the page that lists all the items and loop over them one by one, entering each item by taking its URL and visiting its page. Then I try to get the values of the images (.jpeg URL files) and return them (each item has multiple colors, so I am trying to take all the images of all the colors of that specific item). The problem is that my code currently returns the color URLs on separate lines. What I want is to return all the color URLs of a specific item on one line of the JSON file and then loop on to the next item.
My current code:
import scrapy

class USSpider(scrapy.Spider):
    name = 'US'
    start_urls = ['https://tr.uspoloassn.com/sadece-online-erkek/?attributes_filterable_product_base_type=T-Shirt']

    def parse(self, response):
        for j in range(int(response.css('.js-product-list-load').xpath("@page").extract_first()),
                       int(response.css('.js-product-list-load').xpath("@numpages").extract_first())):
            l = 'https://tr.uspoloassn.com/sadece-online-erkek/?attributes_filterable_product_base_type=T-Shirt' + '&page=' + str(j)
            yield scrapy.Request(url=l, callback=self.parse2)

    def parse2(self, response):
        for i in range(len(response.css('a.js-product-images-wrapper'))):
            link = 'https://tr.uspoloassn.com' + response.css('a.js-product-images-wrapper')[i].attrib['href']
            Url = response.urljoin(link)
            yield scrapy.Request(Url, callback=self.parse3)

    def parse3(self, response):
        colors = list(set(response.xpath('//*[@class="js-variant-area "]').css('ul li')
                          .xpath("//a[@class='js-variant ']").xpath("@data-value").extract()))
        link = response.url
        arnold = []
        for i in colors:
            if i[0].lower() == 'v':
                url = link + '?integration_color=' + i
                yield scrapy.Request(url, callback=self.parseImage)

    def parseImage(self, response):
        yield {
            'image links': response.css("a.js-product-thumbnail").xpath("@data-image").extract()
        }
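One common pattern for collecting everything into a single item is to chain the color requests and carry an accumulator along with each request. A rough sketch only, not the poster's code: parse_image and the cb_kwargs names ('pending', 'images') are placeholders of mine, and it assumes Scrapy 1.7+ for cb_kwargs.

    def parse3(self, response):
        colors = list(set(response.xpath('//*[@class="js-variant-area "]').css('ul li')
                          .xpath("//a[@class='js-variant ']/@data-value").extract()))
        color_urls = [response.url + '?integration_color=' + c for c in colors if c and c[0].lower() == 'v']
        if color_urls:
            # start the chain: visit the first color, carry the rest plus an empty accumulator
            yield scrapy.Request(color_urls[0], callback=self.parse_image,
                                 cb_kwargs={'pending': color_urls[1:], 'images': []})

    def parse_image(self, response, pending, images):
        images = images + response.css("a.js-product-thumbnail").xpath("@data-image").extract()
        if pending:
            # more colors left for this product: follow the next one, carrying the links so far
            yield scrapy.Request(pending[0], callback=self.parse_image,
                                 cb_kwargs={'pending': pending[1:], 'images': images})
        else:
            # all colors visited: emit a single item with every image link for this product
            yield {'image links': images}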

Another Scrapy Question: Output to Console but not to .json

This is another newbie scrapy question:
When I first started with the scrapy tutorial linked here:
https://docs.scrapy.org/en/latest/intro/tutorial.html
I can crawl a webpage and then output the scraped content to a JSON file. But when I modify the tutorial to add a few rules, such as:
a traversal depth limit
memory of already-visited pages, so it doesn't traverse them again
the output to the JSON file stops, although I can still see the output on the console. Can someone give me pointers on what I am doing wrong? The modifications can be seen below:
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    #allowed_domains = allowed_domain_list
    start_urls = input_domain_list
    max_depth = 1
    invalid_url = []

    def parse(self, response):
        from_url = ''
        from_text = ''
        depth = 0
        # Extract the meta information from the response, if any
        if 'text' in response.meta:
            from_text = response.meta['text']
        if 'depth' in response.meta:
            depth = response.meta['depth']
        if 'visited' in response.meta:
            visited_dict = response.meta['visited']
        else:
            visited_dict = {}

        if response.status == 404:
            self.invalid_url.append(response.url)
            print('*'*80)
            print('INVALID LINK')
            print('*'*80)
        else:
            page = response.url.split("/")[-2]
            web_page = response.request.url
            ext_text = ' '.join([item.strip() for item in
                                 response.xpath('//body//text()').extract() if item.strip()])
            visited = visited_dict.get('{0}'.format(web_page))
            print('-'*80)
            print('VALID LINK; Depth: {0}; Visited: {1}'.format(depth, visited))
            print('-'*80)
            yield {'text': ext_text,
                   'source': web_page}

            if not visited and depth <= self.max_depth:
                for selector in response.xpath('//a/@href'):
                    if selector is not None:
                        link = selector.get()
                        request = response.follow(link, callback=self.parse)
                        request.meta['visited'] = visited_dict
                        request.meta['visited'].update({'{0}'.format(web_page): 1})
                        request.meta['depth'] = depth + 1
                        print('*'*80)
                        print(link, request.meta['visited'])
                        print('*' * 80)
                        yield request
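For reference, the tutorial's JSON output comes from Scrapy's feed exports, not from the spider itself, so the spider only needs to keep yielding items. A sketch of the two usual ways to enable the export (the file name quotes.json is just a placeholder):

    # from the command line:
    #   scrapy crawl quotes -o quotes.json
    #
    # or via the FEEDS setting in settings.py (Scrapy 2.1+):
    FEEDS = {
        'quotes.json': {'format': 'json'},
    }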

How to scrape website with multiple pages using scrapy

I am trying to scrape this website (which has multiple pages) using scrapy. The problem is that I can't find the next page URL.
Do you have an idea on how to scrape a website with multiple pages (with scrapy) or how to solve the error I'm getting with my code?
I tried the code below but it's not working:
class AbcdspiderSpider(scrapy.Spider):
    """
    Class docstring
    """
    name = 'abcdspider'
    allowed_domains = ['abcd-terroir.smartrezo.com']
    alphabet = list(string.ascii_lowercase)
    url = "https://abcd-terroir.smartrezo.com/n31-france/annuaireABCD.html?page=1&spe=1&anIDS=31&search="
    start_urls = [url + letter for letter in alphabet]
    main_url = "https://abcd-terroir.smartrezo.com/n31-france/"
    crawl_datetime = str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    start_time = datetime.datetime.now()

    def parse(self, response):
        self.crawler.stats.set_value("start_time", self.start_time)
        try:
            page = response.xpath('//div[@class="pageStuff"]/span/text()').get()
            page_max = get_num_page(page)
            for index in range(page_max):
                producer_list = response.xpath('//div[@class="clearfix encart_ann"]/@onclick').getall()
                for producer in producer_list:
                    link_producer = self.main_url + producer
                    yield scrapy.Request(url=link_producer, callback=self.parse_details)
                next_page_url = "/annuaireABCD.html?page={}&spe=1&anIDS=31&search=".format(index)
                if next_page_url is not None:
                    yield scrapy.Request(response.urljoin(self.main_url + next_page_url))
        except Exception as e:
            self.crawler.stats.set_value("error", e.args)
I am getting this error:
'error': ('range() integer end argument expected, got unicode.',)

The error is here:
page = response.xpath('//div[@class="pageStuff"]/span/text()').get()
page_max = get_num_page(page)

The range function expects an integer value (1, 2, 3, 4, etc.), not a Unicode string ('Page 1 / 403').
My proposal for the range error is:
page = response.xpath('//div[@class="pageStuff"]/span/text()').get().split('/ ')[1]
for index in range(int(page)):
    # your actions
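Putting that together with the pagination URL from the question (a sketch only; it assumes the pager text always looks like 'Page 1 / 403', and parse_page is a placeholder callback name of mine):

    page_text = response.xpath('//div[@class="pageStuff"]/span/text()').get()   # e.g. 'Page 1 / 403'
    page_max = int(page_text.split('/')[1].strip())
    for index in range(1, page_max + 1):
        next_page = "annuaireABCD.html?page={}&spe=1&anIDS=31&search=".format(index)
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse_page)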

Scrapy - unable to make additional request in XMLFeedSpider

I have a scrapy spider that uses XMLFeedSpider. As well as the data returned for each node in parse_node(), I also need to make an additional request to get more data. The only issue is that if I yield an additional request from parse_node(), nothing gets returned at all:
class MySpidersSpider(XMLFeedSpider):
    name = "myspiders"
    namespaces = [('g', 'http://base.google.com/ns/1.0')]
    allowed_domains = {"www.myspiders.com"}
    start_urls = [
        "https://www.myspiders.com/productMap.xml"
    ]
    iterator = 'iternodes'
    itertag = 'item'

    def parse_node(self, response, node):
        if (self.settings['CLOSESPIDER_ITEMCOUNT']
                and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
            raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
        else:
            self.item_count += 1
        id = node.xpath('id/text()').extract()
        title = node.xpath('title/text()').extract()
        link = node.xpath('link/text()').extract()
        image_link = node.xpath('g:image_link/text()').extract()
        gtin = node.xpath('g:gtin/text()').extract()
        product_type = node.xpath('g:product_type/text()').extract()
        price = node.xpath('g:price/text()').extract()
        sale_price = node.xpath('g:sale_price/text()').extract()
        availability = node.xpath('g:availability/text()').extract()
        item = MySpidersItem()
        item['id'] = id[0]
        item['title'] = title[0]
        item['link'] = link[0]
        item['image_link'] = image_link[0]
        item['gtin'] = gtin[0]
        item['product_type'] = product_type[0]
        item['price'] = price[0]
        item['sale_price'] = '' if len(sale_price) == 0 else sale_price[0]
        item['availability'] = availability[0]
        yield Request(item['link'], callback=self.parse_details, meta={'item': item})

    def parse_details(self, response):
        item = response.meta['item']
        item['price_per'] = 'test'
        return item
If I change the last line of parse_node() to return item, it works fine (without setting price_per in the item, naturally).
Any idea what I'm doing wrong?
Have you tried checking the contents of item['link']? If it is a relative link (example: /products?id=5), the URL won't return anything and the request will fail. You need to make sure it's a resolvable link (example: https://www.myspiders.com/products?id=5).
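If that turns out to be the case, a one-line sketch of the usual fix is to resolve the link against the response before requesting it:

    yield Request(response.urljoin(item['link']), callback=self.parse_details, meta={'item': item})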
I discovered the issue - I was limiting the number of items processed in my parse_node() function. However, because of the limit, my spider was terminating before the additional request was made. Moving the code that limits the items processed to my parse_details() function resolves the issue:
def parse_details(self, response):
    if (self.settings['CLOSESPIDER_ITEMCOUNT']
            and int(self.settings['CLOSESPIDER_ITEMCOUNT']) == self.item_count):
        raise CloseSpider('CLOSESPIDER_ITEMCOUNT limit reached - ' + str(self.settings['CLOSESPIDER_ITEMCOUNT']))
    else:
        self.item_count += 1
    item = response.meta['item']
    item['price_per'] = 'test'
    return item

RegEx to get URLs from body string

So I was just wondering what my getURLs function's issue might be. I'm trying to get all URLs from within the containing body string.
My crawler isn't crawling anything because my input URLs are invalid.
# Get all URLs contained within the body string
def getURLs(body):
    urls = []
    tempArr = body.split("a href=")
    index = 1
    for part in tempArr:
        if part[0] == '"':
            while (part[index] != '"' and index < len(part)):
                index += 1
            if index < len(part):
                urls.append(part[1:index-1])
            index = 1
    return urls

# Open file which contains input urls
with open("test_urls.txt", "rU") as infile:
    urls = [row.strip("\n") for row in infile]

class BackpageSpider(CrawlSpider):
    name = 'backpage'
    allowed_domains = ['backpage.com']
    start_urls = urls

    def parse(self, response):
        #print response.url
        if response.status < 600:
            # all_links = response.xpath("//div[contains(@class,'cat')]/a/@href").extract()
            #all the links FOR THE ESCORTS on whatever page we're on
            todays_links = []
            #all the links for today's date
            backpage_date = backpage_date_today()
            yesterday_date = backpage_date_yesterday()
            if backpage_date in response.body:
                todays_section = response.body.split(backpage_date)[1].split(yesterday_date)[0].decode('utf-8')
                # todays_links = todays_section.xpath("//div[contains(@class,'cat')]/a/@href").extract
                todays_links = getURLs(todays_section)
                # for url in todays_links:
                #     todays_links.append(url)
                # for url in all_links:
                #     if url in todays_section:
                #         todays_links.append(url)
            for url in todays_links:
                yield scrapy.Request(url, callback=self.parse_ad_into_content)  ####HERE
            for url in set(response.xpath('//a[@class="pagination next"]/@href').extract()):
                yield scrapy.Request(url, callback=self.parse)
        else:
            time.sleep(600)
            yield scrapy.Request(response.url, callback=self.parse)

    def parse_ad_into_content(self, response):
        #ipdb.set_trace()
        item = items.BackpageScrapeItem(
            url=response.url,
            backpage_id=response.url.split('.')[0].split('/')[2].encode('utf-8'),
            text=response.body,
            posting_body=response.xpath("//div[@class='postingBody']").extract()[0].encode('utf-8'),
            date=datetime.utcnow() - timedelta(hours=5),
            posted_date=response.xpath("//div[@class='adInfo']/text()").extract()[0].encode('utf-8'),
            posted_age=response.xpath("//p[@class='metaInfoDisplay']/text()").extract()[0].encode('utf-8'),
            posted_title=response.xpath("//div[@id='postingTitle']//h1/text()").extract()[0].encode('utf-8')
        )
        return item
The web page is: http://grandisland.backpage.com/FemaleEscorts/?layout=date
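For reference, a small sketch of the regex approach the title asks about (the pattern and the get_urls helper are mine, and relative hrefs are resolved with response.urljoin so the resulting URLs have a scheme):

    import re

    def get_urls(body, response):
        # pull every href value out of the raw HTML and resolve it against the page URL
        hrefs = re.findall(r'href="([^"]+)"', body)
        return [response.urljoin(h) for h in hrefs if h.strip()]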
