How do I use mechanize to input username and password onto this site?
I deleted and changed my post because my previous one had too much extra information
I've read in other posts that maybe this has to do with javascript, but how do I tell? and what do I do with that information?
import mechanize
import cookielib
url = 'https://www.pin1.harvard.edu/cas/login?service=https%3A%2F%2Fwww.pin1.harvard.edu%2Fpin%2Fauthenticate%3F__authen_application%3DFAS_AC_AUTHENTICATOR'
#req = requests.get(url)
#dom = web.Element(req.text)
#Handles all the browser details
br = mechanize.Browser()
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
#self.browser = mechanize.Browser(factory=mechanize.RobustFactory())
#Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.open(url)
#Select First Form
#br.select_form(nr=1)
#br['username'] = '40839852'
#print list(br.forms())[0]
for form in br.forms():
print "Form name:", form.name
print form
break
br.select_form(name= formname)
br[searchname] = term
res = br.submit()
content = res.read()
dom = web.Element(content)
TRACEBACK
ParseError: unexpected '/' char in declaration
---> 32 for form in br.forms():
33 print "Form name:", form.name
34 print form
UPDATE - BASED ON PACO'S SUGGESTION I ADDED... But I still get a traceback.
Python unable to retrieve form with urllib or mechanize
beg = re.search(t, res.read()).span()[1]
res.set_data(res.get_data()[beg:])
br.set_response(response)
br.select_form(nr=0)
<ipython-input-25-bd1b73406b45> in <module>()
28 br.set_response(response)
29
---> 30 br.select_form(nr=0)
31
32
ParseError: unexpected '-' char in declaration
This is how I selected the first form in my code.
br.select_form(nr=0)
#Form fields to populate
br.form['username'] = username
br.form['password'] = password
#Submit the login form
br.submit()
Modify it to suit your needs. The "nr=0" is probably what you're looking for.
But the problem is the DOCTYPE. I tested the following, and it strips it out.
html = br.response().get_data().replace('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd >', '')
response = mechanize.make_response(
html, [("Content-Type", "text/html")],
url, 200, "OK")
br.set_response(response)
I took this straight from the Mechanize FAQ.
Related
I am trying to use mechanize to scrape a website that requires me to log in. Here is the start of my code.
#!/usr/bin/python
#scrape the admissions part of SAFE
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('https://url')
# View available forms
for f in br.forms():
print f
This gives me
<POST https://userstuff application/x-www-form-urlencoded
<HiddenControl(lt=LT-227363-Ja4QpRvdxrbQF0nb7XcR2jQDydH43s) (readonly)>
<HiddenControl(execution=e1s1) (readonly)>
<HiddenControl(_eventId=submit) (readonly)>
<TextControl(username=)>
<PasswordControl(password=)>
<SubmitButtonControl(submit=) (readonly)>
<CheckboxControl(warn=[on])>>
How can I now enter the username and password?
I tried
# Select the first (index zero) form
br.select_form(nr=0)
# User credentials
br.form['username'] = 'username'
br.form['password'] = 'password'
# Login
br.submit()
But that doesn't seem to work.
In the end this worked for me
#!/usr/bin/python
#scraper
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-agent', 'Chrome')]
# The site we will navigate into, handling its session
br.open('url1')

# Pick the login form by its HTML id instead of by position.
# .get() is used so that forms without an id attribute are skipped
# rather than raising KeyError.
for f in br.forms():
    if f.attrs.get('id') == 'fm1':
        br.form = f
        break

# User credentials (placeholders: put each value in its matching field)
br.form['username'] = 'username'
br.form['password'] = 'password'

# Login
br.submit()

# Now we need to confirm again
confirm_url = 'https://url2'
br.open(confirm_url)

# Select the first (index zero) form
br.select_form(nr=0)

# Login
br.submit()

# Re-open the same confirmation URL and dump the resulting page
print(br.open(confirm_url).read())
I'd look at the html form rather than what mechanize gives you. Below is an example of a form I've tried to fill out in the past.
<input type="text" name="user_key" value="">
<input type="password" name="user_password">
Below is the code I use to log into that website using the form above
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_refresh(False)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# The site we will navigate into, handling its session
br.open('https://www.website.com/login')
#select the first form
br.select_form(nr=0)
#user credentials
br['user_key'] = 'myusername#gmail.com'
br['user_password'] = 'mypassword'
# Login
br.submit()
link = 'http://www.website.com/url_i_want_to_scrape'
br.open(link)
response = br.response().read()
print response
Your issue could be that you're either choosing the wrong form or giving the incorrect field names.
I have just started using python and mechanize and I need some help understanding why this will not work.
import mechanize
username = "<username>"
password = "<password>"
ua = 'Mozilla/5.0 (X11; Linux x86_64; rv:18.0) Gecko/20100101 Firefox/18.0 (compatible;)'

br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-Agent', ua), ('Accept', '*/*')]
br.open("https://www.instagram.com/accounts/login/")

# Select the first form on the page
br.select_form(nr=0)

# Form controls are addressed by their field name, not by the credential
# value, so the keys must be string literals.
# NOTE(review): "username"/"password" are assumed field names - confirm
# against the page's HTML.
br["username"] = username
br["password"] = password

result = br.submit().read()

# Save the response; the context manager closes the file even if the
# write fails.
with open('output.html', 'w') as f:
    f.write(result)
the error it throws is:
FormNotFoundError: no form matching nr 0
and I cannot figure out what the correct form id is. Even after attempting to list all forms using the following:
print("\n Printing all forms on page...\n")
for form in br.forms():
print("Form Name:", form.name)
print form
print("\n Done printing forms...\n")
the list of forms is empty. How can I find what the forms id is if mechanize cannot see the form itself?
To select the first form, you could do this:
br.form = list(br.forms())[0]
then, to enter the username and password:
br.form["username"] = username
br.form["password"] = password
and then finally, to submit:
req = br.submit()
edit: just noticed you said the list of forms would return empty. This is most likely because the site is using javascript. You might have better luck with selenium.
Good luck!
I'm trying to use Mechanize to get emails from my Outlook web client, but I'm having troubles logging in. It gives me the errors listed below. I've verified that the user name and password are correct. Any ideas?
Here is my code:
import cookielib

import mechanize

# Browser with an explicit cookie jar so session cookies persist
# between the initial page load and the form submission.
b = mechanize.Browser()
cj = cookielib.LWPCookieJar()
b.set_cookiejar(cj)

b.open('https://mail.example.com/owa/')

# Select the login form by name on the same browser object that opened
# the page (the browser here is `b`; no `br` is defined in this script).
b.select_form("logonForm")
b['username'] = 'myname'
b['password'] = 'password'
b.submit()
I can see that form components are being accessed correctly, but after submitting, the login page displays again with two errors:
The user name or password you entered isn't correct. Try entering it again.
Please enable cookies for this Web site.
I thought the b.set_cookiejar(cj) would take care of cookies. Could this be the root of my problem?
import mechanize
import cookielib
br = mechanize.Browser()
br.set_handle_robots( False )
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open('https://webmail.server.com')
br.select_form(nr = 0)
br.form['username'] = 'username'
br.form['password'] = 'password'
br.submit()
Use this; it works for me.
I'm using mechanize and python to log into a website and then iterate through a bunch of URLs. Those URLs are all behind a login-wall so I need to get logged in, then view many of the pages.
Challenge I'm facing:
I'm hitting rate limits so I'm using a lot of time.sleep commands.
I would like to be able to alternate my login credentials, say every 10 pages viewed.
As a very new python user and person who's generally new to web scraping, do you have advice on my code?
Does it make sense to login initially, and then view pages over and over or is there a better way to do that?
def get_Text_From_Links(inputFile,outputFile):
    """Log in once, then fetch the URL in each CSV row and tag the row.

    Reads rows from ``inputFile`` with ``csv.reader``, opens the URL found
    in column index 6 of each row with a logged-in mechanize browser, and
    writes the row to ``outputFile`` with one extra column:
    ``'text found'`` if the page contains ``'target text'``, otherwise
    ``'text not found'``.

    After every 10 rows the loop sleeps for 15 seconds before continuing.

    Args:
        inputFile: path of the CSV file to read.
        outputFile: path of the CSV file to write.
    """
    with open(inputFile, "rU") as csvinput, open(outputFile, 'wb') as csvoutput:
        reader = csv.reader(csvinput)
        writer = csv.writer(csvoutput)

        # Browser with a cookie jar so the login session is reused for
        # every page request below.
        br = mechanize.Browser()
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)

        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_redirect(True)
        br.set_handle_refresh(False)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

        # Log in once up front
        br.open('https://www.website.com/login')
        br.select_form(nr=0)
        br['username'] = 'login#gmail.com'
        br['password'] = 'pa$$w0rd'
        br.submit()

        # The counter must exist before the loop reads it; without this
        # the first comparison below raises.
        count = 0

        for i in reader:
            URL = i[6]

            # Pause once the counter reaches 10, then start counting again.
            if count == 10:
                time.sleep(15)
                count = 0
            else:
                count = count + 1

            try:
                br.open(URL)
            except urllib2.URLError:
                print("HTTP ERROR")
                # NOTE(review): execution continues after a failed open, so
                # the response read below may not belong to this URL -
                # confirm whether such rows should be skipped or flagged.

            html = br.response().read()

            v = 'text found' if 'target text' in html else 'text not found'
            i.append(v)
            writer.writerow(i)
I've got a script set to log into a website. The challenge is that I'm running the script on EC2 and the website is asking for me to do additional verification by sending me a custom code.
I receive the email immediately but need to be able to update that field on the fly.
This is the script
import urllib2
import urllib2
import cookielib
import urllib
import requests
import mechanize
from bs4 import BeautifulSoup
# Browser
br = mechanize.Browser()
# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_refresh(False)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# The site we will navigate into, handling its session
br.open('https://www.website.com/login')
#select the first form
br.select_form(nr=0)
#user credentials
br['user_key'] = 'username#gmail.com'
br['user_password'] = 'somePassw0rd'
# Login
br.submit()
#enter verification code
input_var = raw_input("Enter something: ")
#put verification code in form
br['Verication'] = str(input_var)
#submit form
br.submit()
The challenge for me is that I keep getting an error saying:
AttributeError: mechanize._mechanize.Browser instance has no attribute __setitem__ (perhaps you forgot to .select_form()?)
What can I do to make this run as intended?
after you br.submit() you go straight into
br['Verication'] = str(input_var)
This is incorrect, since calling br.submit() leaves your browser with no form selected anymore.
after submitting i would try:
for form in br.forms():
print form
to see if there is another form to be selected
Read up on the HTML code of the login site and check to see exactly what happens when you click login. You may have to reselect a form on that same page, then assign the verification code to one of the controls.