I am trying to run the code below using Python (Scrapy), but there is no output.
I am also trying to log in to a webpage; let me know if there are any errors.
The code I am using is this:
class MySpider(Spider):
    def __init__(self, login, password):
        link = "http://freeerisa.benefitspro.com"
        self.login = login
        self.password = password
        self.cj = cookielib.CookieJar()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.loginToFreeErissa()
        self.loginToFreeErissa()

    def loginToFreeErissa(self):
        login_data = urllib.urlencode({
            'MainContent_mainContent_txtEmail': self.login,
            'MainContent_mainContent_txtPassword': self.password,
        })
        response = self.opener.open(link + "/login.aspx", login_data)
        return ''.join(response.readlines())

    def after_login(self, response):
        if "Error while logging in" in response.body:
            self.logger.error("Login failed!")
        else:
            url = [link + "/5500/plandetails.aspx?Ein=042088633",
                   link + "/5500/plandetails.aspx?Ein=046394579"]
            for u in url:
                g_data = soup.find_all("span")
                for item in g_data:
                    return item.text
I tried calling the function and this is the error I received:
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\ProgramData\Anaconda3\lib\site-packages\scrapy\spiders\__init__.py", line 30, in __init__
    raise ValueError("%s must have a name" % type(self).__name__)
ValueError: MySpider must have a name
There is no output because you don't call anything.
In other words, you defined MySpider but you never used it.
Here's a link that could help you.
Change your code to
class MySpider(Spider):
    name = 'myspider'

    def __init__(self, login, password):
        link = "http://freeerisa.benefitspro.com"
and run your spider with

scrapy crawl myspider

See the Scrapy documentation for more information.
The error message could not be plainer: the spider must have a name, and there is no name in the code you have posted. This is basic to creating a spider in Scrapy. Also, your Python spacing is terrible; you need an editor with pylint or something similar that will tell you about PEP 8 issues.
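For reference, here is a minimal sketch of what the same login-then-scrape flow could look like using Scrapy's own FormRequest instead of urllib2. The form field names are taken from the question; the start URL, the EIN detail pages, and the final parsing step are assumptions used to illustrate the structure, not a verified solution.

import scrapy


class MySpider(scrapy.Spider):
    # A spider must have a name; this is what `scrapy crawl` refers to.
    name = 'myspider'
    start_urls = ['http://freeerisa.benefitspro.com/login.aspx']

    def __init__(self, login='', password='', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.login = login
        self.password = password

    def parse(self, response):
        # Submit the login form with Scrapy's FormRequest; cookies are
        # handled by the framework, so no urllib2/cookielib is needed.
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                'MainContent_mainContent_txtEmail': self.login,
                'MainContent_mainContent_txtPassword': self.password,
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        if "Error while logging in" in response.text:
            self.logger.error("Login failed!")
            return
        # EINs taken from the question; adjust as needed.
        for ein in ('042088633', '046394579'):
            yield response.follow('/5500/plandetails.aspx?Ein=' + ein,
                                  callback=self.parse_plan)

    def parse_plan(self, response):
        for text in response.css('span::text').getall():
            yield {'text': text}

The login and password arguments can then be passed on the command line, e.g. scrapy crawl myspider -a login=me@example.com -a password=secret.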
Related
Below is my source code. I am getting a KeyError: 'No input element with the name None' error.
import re
import json
import scrapy
from loginform import fill_login_form


class wikihowSpider(scrapy.Spider):
    name = "wikihow"
    start_urls = ['http://www.wikihow.com/Category:Arts-and-Entertainment']
    login_url = 'https://www.wikihow.com/Main-Page#wh-dialog-login'

    def start_requests(self):
        yield scrapy.Request(self.login_url, self.parse_login)

    def parse_login(self, response):
        print('Here')
        data, url, method = fill_login_form(response.url, response.body,
                                            'username', 'password')
        return scrapy.FormRequest(url, formdata=dict(data),
                                  method=method, callback=self.parse_main)

    def parse_main(self, response):
        # crawl
My use case is to log in and then crawl, given a list of starting URLs.
I also tried using versions of the examples mentioned here, but kept getting errors like Ignoring response <404 https://www.wikihow.com/wikiHowTo?search=&wpName=username&wpPassword=password&wpRemember=1&wploginattempt=Log+in>: HTTP status code is not handled or not allowed. Any help would be appreciated!
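For reference, the FormRequest-based variants mentioned above generally look something like the sketch below; the wpName/wpPassword field names come from the failing URL in the error message, and the rest is an assumption rather than a known fix.

import scrapy


class WikihowLoginSketch(scrapy.Spider):
    # Hypothetical illustration only.
    name = "wikihow_login_sketch"
    login_url = 'https://www.wikihow.com/Main-Page#wh-dialog-login'
    start_urls = ['http://www.wikihow.com/Category:Arts-and-Entertainment']

    def start_requests(self):
        yield scrapy.Request(self.login_url, callback=self.login)

    def login(self, response):
        # Let Scrapy locate the login <form> and fill in the credentials.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'wpName': 'username', 'wpPassword': 'password'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Continue with the normal crawl once logged in.
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        pass  # crawl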
I am trying to get a JSON field with the key "longName" with Scrapy, but I am receiving the error: "Spider must return request, item, or None, got 'str'".
The JSON I'm trying to scrape looks something like this:
{
    "id": 5355,
    "code": 9594,
}
This is my code:
import scrapy
import json


class NotesSpider(scrapy.Spider):
    name = 'notes'
    allowed_domains = ['blahblahblah.com']
    start_urls = ['https://blahblahblah.com/api/123']

    def parse(self, response):
        data = json.loads(response.body)
        yield from data['longName']
I get the above error when I run scrapy crawl notes at the prompt. Can anyone point me in the right direction?
If you only want longName, modifying your parse method like this should do the trick (yield from data['longName'] iterates over the string one character at a time, so Scrapy receives bare 'str' values, which is exactly what the error complains about):
def parse(self, response):
    data = json.loads(response.body)
    yield {"longName": data["longName"]}
I am trying to use the code below to search for a keyword at a given URL (an internal website at work) and I keep getting the error. It works fine on public sites.
from html.parser import HTMLParser
import urllib.request


class CustomHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.tag_flag = False
        self.tag_line_num = 0
        self.tag_string = 'temporary_tag'

    def initiate_vars(self, tag_string):
        self.tag_string = tag_string

    def handle_starttag(self, tag, attrs):
        #if tag == 'tag_to_search_for':
        if tag == self.tag_string:
            self.tag_flag = True
            self.tag_line_num = self.getpos()


if __name__ == '__main__':
    #simple_str = 'string_to_search_for'
    simple_str = 'Host Status'
    my_url = 'TEST_URL'

    parser_obj = CustomHTMLParser()
    #parser_obj.initiate_vars('tag_to_search_for')
    parser_obj.initiate_vars('script')

    #html_file = open('location_of_html_file//file.html')
    my_request = urllib.request.Request(my_url)
    try:
        url_data = urllib.request.urlopen(my_request)
    except:
        print("There was some error opening the URL")

    html_str = url_data.read().decode('utf8')
    #html_str = html_file.read()
    #print (html_str)

    html_search_result = html_str.lower().find(simple_str.lower())
    if html_search_result != -1:
        print('The word {} was found'.format(simple_str))
    else:
        print('The word {} was not found'.format(simple_str))

    parser_obj.feed(html_str)
    if parser_obj.tag_flag:
        print('Tag {0} was found at position {1}'.format(parser_obj.tag_string, parser_obj.tag_line_num))
    else:
        print('Tag {} was not found'.format(parser_obj.tag_string))
but I keep getting the error
There was some error opening the URL
Traceback (most recent call last):
  File "C:\TEMP\parse.py", line 40, in <module>
    html_str = url_data.read().decode('utf8')
NameError: name 'url_data' is not defined
I believe I already tried using urllib2; I am using Python 3.7.
Not sure what to do. Is it worth trying a user agent?
EDIT1: I have now tried the below
>>> import urllib
>>> url = urllib.request.urlopen('https://concernedURL.com')
and I am getting this error "urllib.error.HTTPError: HTTP Error 401: Unauthorized". Should I be using the headers I have from my browser as well as SSL certs?
The problem is that you get an error in the try-block, and that leaves the url_data variable undefined:
try:
    # if this errors, no url_data will exist
    url_data = urllib.request.urlopen(my_request)
except:
    # really bad to catch all exceptions!
    print("There was some error opening the URL")

html_str = url_data.read().decode('utf8')
You should probably just remove the try-except, or handle the error better. It's almost never advisable to use a bare except without a specific error, since it can create all kinds of problems.
In this case your program should probably just stop running if you cannot open the requested URL, since it really doesn't make sense to try to operate on the URL's data if opening it failed in the first place.
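For example, here is a minimal sketch that catches only the errors urllib actually raises and stops early; the User-Agent header is only a placeholder for whatever authentication the internal site mentioned in the edit really needs for its 401:

import sys
import urllib.request
import urllib.error

my_request = urllib.request.Request(
    my_url,
    # Placeholder header; a 401 usually means real credentials are required.
    headers={'User-Agent': 'Mozilla/5.0'},
)

try:
    url_data = urllib.request.urlopen(my_request)
except urllib.error.HTTPError as err:
    # The server answered, but with an error status such as 401.
    sys.exit("HTTP error opening the URL: {} {}".format(err.code, err.reason))
except urllib.error.URLError as err:
    # The server could not be reached at all.
    sys.exit("Error opening the URL: {}".format(err.reason))

html_str = url_data.read().decode('utf8')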
I am trying to use Scrapy-splash to click a button on a page that I'm being redirected to.
I have tested manually clicking on the page, and I am redirected to the correct page after I have clicked the button that gives my consent. I have written a small script to click the button when I am redirected to the page, but this is not working.
I have included a snippet of my spider below. Am I missing something in my code?
from sys import path
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
path.append(dir_path)

import scrapy
from scrapy_splash import SplashRequest

script = """
function main(splash)
    splash:wait(1)
    splash:runjs('document.querySelector("form.consent-form").submit()')
    splash:wait(1)
    return {
        html = splash:html(),
    }
end
"""
class FoobarSpider(scrapy.Spider):
    name = "foobar"

    def start_requests(self):
        urls = ['https://uk.finance.yahoo.com/quote/ANTO.L?p=ANTO.L']
        for url in urls:
            yield SplashRequest(url=url, callback=self.parse,
                                endpoint='render.html',
                                args={'wait': 3},
                                meta={'yahoo_url': url}
                                )

    def parse(self, response):
        url = response.url
        with open('temp.html', 'wb') as f:
            f.write(response.body)

        if 'https://guce.' in url:
            print('About to attempt to authenticate ...')
            yield SplashRequest(
                url,
                callback=self.get_price,
                endpoint='execute',
                args={'lua_source': script, 'timeout': 5},
                meta=response.meta
            )
        else:
            self.get_price(response)

    def get_price(self, response):
        print("Get price called!")
        yahoo_price = None
        try:
            # Get Price ...
            temp1 = response.css('div.D\(ib\).Mend\(20px\)')
            if temp1 and len(temp1) > 1:
                temp2 = temp1[1].css('span')
                if len(temp2) > 0:
                    yahoo_price = temp2[0].xpath('.//text()').extract_first().replace(',', '')

            if not yahoo_price:
                val = response.css('span.Trsdu\(0\.3s\).Trsdu\(0\.3s\).Fw\(b\).Fz\(36px\).Mb\(-4px\).D\(b\)').xpath('.//text()').extract_first().replace(',', '')
                yahoo_price = val
        except Exception as err:
            pass

        print("Price is: {0}".format(yahoo_price))

    def handle_error(self, failure):
        pass
How do I fix this so that I can correctly give consent, so I'm directed to the page I want?
Rather than clicking the button, try submitting the form:
document.querySelector("form.consent-form").submit()
I tried running the JavaScript command document.querySelector("input.btn.btn-primary.agree").click() in my console and got an error message "Oops, Something went Wrong", but the page loads when using the above code to submit the form instead.
Because I'm not in Europe I can't fully recreate your setup but I believe that should get you past the issue. My guess is that this script is interfering with the .click() method.
My spider works, but I can't save the body of the website I crawl to a .html file. If I write self.html_file.write('test') then it works fine. I don't know how to convert the tuple to a string.
I use Python 3.6
Spider:
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ['google.com']
    start_urls = ['http://google.com/']

    def __init__(self):
        self.path_to_html = html_path + 'index.html'
        self.path_to_header = header_path + 'index.html'
        self.html_file = open(self.path_to_html, 'w')

    def parse(self, response):
        url = response.url
        self.html_file.write(response.body)
        self.html_file.close()
        yield {
            'url': url
        }
Traceback:
Traceback (most recent call last):
  File "c:\python\python36-32\lib\site-packages\twisted\internet\defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "c:\Users\kv\AtomProjects\example_project\example_bot\example_bot\spiders\example.py", line 35, in parse
    self.html_file.write(response.body)
TypeError: write() argument must be str, not bytes
The actual problem is that you are getting bytes. You need to convert them to a string, and there are many ways to convert bytes to a string.
You can use
self.html_file.write(response.body.decode("utf-8"))
instead of
self.html_file.write(response.body)
You can also use
self.html_file.write(response.text)
The correct way is to use response.text, and not response.body.decode("utf-8"). To quote the documentation:
Keep in mind that Response.body is always a bytes object. If you want the unicode version use TextResponse.text (only available in TextResponse and subclasses).
and
text: Response body, as unicode.
The same as response.body.decode(response.encoding), but the result is cached after the first call, so you can access response.text multiple times without extra overhead.
Note: unicode(response.body) is not a correct way to convert response body to unicode: you would be using the system default encoding (typically ascii) instead of the response encoding.
Taking into consideration the responses above, and making it as Pythonic as possible by using the with statement, the example can be rewritten like this:
class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ['google.com']
    start_urls = ['http://google.com/']

    def __init__(self):
        self.path_to_html = html_path + 'index.html'
        self.path_to_header = header_path + 'index.html'

    def parse(self, response):
        with open(self.path_to_html, 'w') as html_file:
            html_file.write(response.text)
        yield {
            'url': response.url
        }
But the html_file will only be accessible from the parse method.
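If the file handle really does need to outlive parse (as in the original __init__-based version), one possible sketch is to keep it on the spider and close it in the closed() hook that Scrapy calls when the spider finishes; html_path is the same externally defined variable as above:

class ExampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ['google.com']
    start_urls = ['http://google.com/']

    def __init__(self):
        self.path_to_html = html_path + 'index.html'
        # Keep the handle open for the whole crawl.
        self.html_file = open(self.path_to_html, 'w')

    def parse(self, response):
        self.html_file.write(response.text)
        yield {'url': response.url}

    def closed(self, reason):
        # Called by Scrapy when the spider finishes.
        self.html_file.close()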