Read page source before POST - python

I want to know if there is a way to POST parameters after reading the page source. Ex: read captcha before posting ID#
My current code:
import requests
id_number = "1"
url = "http://www.submitmyforum.com/page.php"
data = dict(id = id_number, name = 'Alex')
post = requests.post(url, data=data)
There is a captcha that changes after every request to http://submitforum.com/page.php (obviously not a real site). I would like to read that parameter and submit it in the "data" variable.

As discussed in the OP's comments, Selenium can be used; methods without browser emulation may also exist.
Using Selenium (http://selenium-python.readthedocs.io/) instead of the requests module:
import re
from selenium import webdriver

regexCaptcha = "k=.*&co="
url = "http://submitforum.com/page.php"

# Get to the URL
browser = webdriver.Chrome()
browser.get(url)

# Example for getting page elements (using CSS selectors)
# In this example, I'm getting the Google reCAPTCHA ID if present on the current page
try:
    element = browser.find_element_by_css_selector('iframe[src*="https://www.google.com/recaptcha/api2/anchor?k"]')
    captchaID = re.findall(regexCaptcha, element.get_attribute("src"))[0].replace("k=", "").replace("&co=", "")
    captchaFound = True
    print("Captcha found!", captchaID)
except Exception:
    print("No captcha found!")
    captchaFound = False

# Treat the captcha
# --> Your treatment code, producing captcha_answer

# Enter the captcha response on the page
captchaResponse = browser.find_element_by_id('captcha-response')
captchaResponse.send_keys(captcha_answer)

# Validate the form
validateButton = browser.find_element_by_id('submitButton')
validateButton.click()

# --> Analysis of the returned page if needed
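For the non-browser route mentioned above, here is a minimal requests-only sketch, assuming the challenge value is exposed directly in the page HTML; the hidden-input name captcha_token (and the idea that you can read the answer this way) is an assumption, not something taken from the real site:
import requests
from bs4 import BeautifulSoup

id_number = "1"
url = "http://www.submitmyforum.com/page.php"

with requests.Session() as session:  # one session so cookies persist between the GET and the POST
    page = session.get(url)  # read the page source first
    soup = BeautifulSoup(page.text, "html.parser")

    # Hypothetical: assume the challenge is exposed in a hidden input named "captcha_token"
    token_field = soup.find("input", {"name": "captcha_token"})
    captcha_token = token_field["value"] if token_field else ""

    data = {"id": id_number, "name": "Alex", "captcha_token": captcha_token}
    post = session.post(url, data=data)
    print(post.status_code)
If the captcha is a real image or a reCAPTCHA, reading the page source alone will not be enough, and the Selenium route above is the more realistic option.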

Related

python selenium webscraping (clicking buttons which shows data and then extracting it)

What I'm trying to do: open this link, https://www.jobbank.gc.ca/jobsearch/jobsearch?sort=D&fsrc=16&fbclid=IwAR2SIG3lbY1S9lO4WilcKw6TxJAJQbFIGYTVE_tOTqYRpb43qM3uYgLWV64, open all the listings, and each one redirects to another page where there is a button ("Show how to apply"); when we click on that button, an email address is shown. I want to scrape every job listing's title and email address through my code. I have already scraped the titles and hrefs, but I have no idea what to do next (e.g. clicking on every job listing, then clicking "Show how to apply" and scraping the email from there).
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

s = Service(r'C:\Program Files (x86)\chromedriver.exe')
driver = webdriver.Chrome(service=s)
driver.get('https://www.jobbank.gc.ca/jobsearch/jobsearch?sort=D&fsrc=16&fbclid=IwAR2SIG3lbY1S9lO4WilcKw6TxJAJQbFIGYTVE_tOTqYRpb43qM3uYgLWV64')

# Get titles of job listings
elements = []
for element in driver.find_elements(By.CLASS_NAME, 'resultJobItem'):
    title = element.find_element(By.XPATH, './/*[@class="noctitle"]').text
    if title not in elements:
        elements.append({'Title': title.split('\n')})

# Get all hrefs
links = driver.find_elements(By.XPATH, './/*[@class="results-jobs"]/article/a')
for link in links:
    elements.append({'Link': link.get_attribute('href')})

print(elements)
Looks like you can use their own API with a POST request to get the data.
You'll need to scrape the job id.
For the job at this URL, https://www.jobbank.gc.ca/jobsearch/jobposting/35213663,
I see that the job id is 1860693, so I'll need to post a request like this:
import requests
from bs4 import BeautifulSoup as BS

url = "https://www.jobbank.gc.ca/jobsearch/jobposting/35213663"
jobid = "1860693"

data = {
    'seekeractivity:jobid': f'{jobid}',
    'seekeractivity_SUBMIT': '1',
    'javax.faces.ViewState': 'stateless',
    'javax.faces.behavior.event': 'action',
    'jbfeJobId': f'{jobid}',
    'action': 'applynowbutton',
    'javax.faces.partial.event': 'click',
    'javax.faces.source': 'seekeractivity',
    'javax.faces.partial.ajax': 'true',
    'javax.faces.partial.execute': 'jobid',
    'javax.faces.partial.render': 'applynow',
    'seekeractivity': 'seekeractivity'
}

response = requests.post(url, data)
soup = BS(response.text, "html.parser")
email = soup.a.text
print(email)
This gives me:
>> info@taylorlumber.ca
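Since the job id itself also has to be scraped first, here is a rough sketch of one way to hunt for it in the posting page's HTML; the regex is only a guess at how the id might be embedded, so inspect the page source to confirm what to match:
import re
import requests

posting_url = "https://www.jobbank.gc.ca/jobsearch/jobposting/35213663"
html = requests.get(posting_url).text

# Guess: look for a "jobid"-style parameter somewhere in the page source
match = re.search(r'jobid["\']?\s*[:=]\s*["\']?(\d+)', html, re.IGNORECASE)
jobid = match.group(1) if match else None
print(jobid)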
I would store all the links separately.
So assume the variable all_links contains all the links. Now,
.
.
.
driver.quit()
link1 = all_links[0]  # take the first link as an example; you'd loop through all the links: for link in all_links
new_driver = webdriver.Chrome(service=s)
new_driver.get(link1)
new_driver.find_element_by_css_selector("#applynowbutton").click()
At this point the 'Show how to Apply' button has been clicked.
Unfortunately, I don't know too much about HTML, but essentially at this point you can extract the email much like you extracted the links previously; a sketch follows below.
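A minimal sketch of that extraction step, assuming the revealed address is rendered as a mailto: link (that selector is a guess; adjust it to whatever the page actually uses):
import time
time.sleep(2)  # give the "How to apply" panel a moment to expand after the click above

# Assumption: the revealed email is rendered as a mailto: link
email_links = new_driver.find_elements_by_css_selector('a[href^="mailto:"]')
if email_links:
    email = email_links[0].get_attribute("href").replace("mailto:", "")
    print(email)
else:
    print("No mailto link found - inspect the page to pick the right selector")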
Try it like below:
Apply scrollIntoView to each job option. When it reaches the end, click the 'Show more' option and continue extracting details.
driver.get("https://www.jobbank.gc.ca/jobsearch/jobsearch?sort=D&fsrc=16&fbclid=IwAR2SIG3lbY1S9lO4WilcKw6TxJAJQbFIGYTVE_tOTqYRpb43qM3uYgLWV64")
i = 0
while True:
try:
jobs = driver.find_elements_by_xpath("//div[#class='results-jobs']/article")
driver.execute_script("arguments[0].scrollIntoView(true);",jobs[i])
title = jobs[i].find_element_by_xpath(".//span[#class='noctitle']").text
link = jobs[i].find_element_by_tag_name("a").get_attribute("href")
print(f"{i+1} - {title} : {link}")
i+=1
if i == 100:
break
except IndexError:
driver.find_element_by_id("moreresultbutton").click()
time.sleep(3)

Shell script to download a lot of HTML files and store them statically with all CSS

I have posted roughly 290 questions on a science forum that I would like to get back by downloading them with all the associated answers.
The first issue is that I have to be logged in to my personal space to see the list of all the messages. How can I get around this first barrier so that, with a shell script or a single wget command, I can retrieve all the URLs and their content? Can I pass a login and a password to wget so that I am logged in and redirected to the appropriate URL giving the list of all messages?
Once this first issue is solved, the second issue is that I have to start from 6 different menu pages that all contain the titles and the links of the questions.
Moreover, for some of my questions, the answers and discussions may span multiple pages.
So I wonder whether I can achieve this global download, knowing that I would like to store the pages statically with the CSS also stored locally on my computer (to keep the same formatting in my browser when I consult them on my PC).
The URL of the first menu page of questions is only reachable once I am logged in on the website (which could also be an issue for downloading with wget, since I am obliged to be connected).
An example of a URL containing the list of messages, once I am logged in, is:
https://forums.futura-sciences.com/search.php?searchid=22897684
The other pages (there are 6 or 7 pages of discussion titles in total appearing in the main menu page) have the format:
https://forums.futura-sciences.com/search.php?searchid=22897684&pp=&page=2 (for page 2)
https://forums.futura-sciences.com/search.php?searchid=22897684&pp=&page=5 (for page 5)
On each of these pages one can see the title and the link of each discussion that I would like to download, along with the CSS (knowing that each discussion may itself contain multiple pages):
for example, the first page of the discussion https://forums.futura-sciences.com/archives/804364-demonstration-dilatation-temps.html
has page 2: https://forums.futura-sciences.com/archives/804364-demonstration-dilatation-temps-2.html
and page 3: https://forums.futura-sciences.com/archives/804364-demonstration-dilatation-temps-3.html
Naively, I tried to do all this with a single command (using the example URL of my personal space quoted at the beginning of the post, i.e. https://forums.futura-sciences.com/search.php?searchid=22897684):
wget -r --no-check-certificate --html-extension --convert-links "https://forums.futura-sciences.com/search.php?searchid=22897684"
but unfortunately this command downloads all sorts of files, and maybe not even what I want, i.e. my discussions.
I don't know which approach to use: should I first store all the URLs in a file (with all the sub-pages containing the answers and the full discussion for each of my initial questions)?
After that, I could perhaps do wget -i all_URL_questions.txt. How can I carry out this operation?
Update
Since my issue needs a script, I tried the following things with Python:
1)
import urllib, urllib2, cookielib
username = 'USERNAME'
password = 'PASSWORD'
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
login_data = urllib.urlencode({'username' : username, 'password' : password})
opener.open('https://forums.futura-sciences.com/login.php', login_data)
resp = opener.open('https://forums.futura-sciences.com/search.php?do=finduser&userid=253205&contenttype=vBForum_Post&showposts=1')
print resp.read()
But the page printed is not the home page of my personal space.
2)
import requests
# Fill in your details here to be posted to the login form.
payload = {
    'inUserName': 'USERNAME',
    'inUserPass': 'PASSWORD'
}
# Use 'with' to ensure the session context is closed after use.
with requests.Session() as s:
    p = s.post('https://forums.futura-sciences.com/login.php?do=login', data=payload)
    # print the html returned or something more intelligent to see if it's a successful login page.
    print(p.text.encode('utf8'))
    # An authorised request.
    r = s.get('https://forums.futura-sciences.com/search.php?do=finduser&userid=253205&contenttype=vBForum_Post&showposts=1')
    print(r.text.encode('utf8'))
Here too, this doesn't work
3)
import requests
import bs4
site_url = 'https://forums.futura-sciences.com/login.php?do=login'
userid = 'USERNAME'
password = 'PASSWORD'
file_url = 'https://forums.futura-sciences.com/search.php?do=finduser&userid=253205&contenttype=vBForum_Post&showposts=1'
o_file = 'abc.html'
# create session
s = requests.Session()
# GET request. This will generate cookie for you
s.get(site_url)
# login to site.
s.post(site_url, data={'vb_login_username': userid, 'vb_login_password': password})
# Next thing will be to visit URL for file you would like to download.
r = s.get(file_url)
# Download file
with open(o_file, 'wb') as output:
    output.write(r.content)
print(f"requests:: File {o_file} downloaded successfully!")
# Close session once all work done
s.close()
Same thing, the content is wrong
4)
from selenium import webdriver
# To prevent download dialog
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2) # custom location
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', '/tmp')
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv')
webdriver.get('https://forums.futura-sciences.com/')
webdriver.find_element_by_id('ID').send_keys('USERNAME')
webdriver.find_element_by_id ('ID').send_keys('PASSWORD')
webdriver.find_element_by_id('submit').click()
browser = webdriver.Firefox()
browser.get('https://forums.futura-sciences.com/search.php?do=finduser&userid=253205&contenttype=vBForum_Post&showposts=1')
Still not able to log in with USERNAME and PASSWORD and get the content of the home page of my personal space.
5)
from selenium import webdriver
from selenium.webdriver.firefox.webdriver import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
def MS_login(username, passwd):  # call this with username and password
    firefox_capabilities = DesiredCapabilities.FIREFOX
    firefox_capabilities['moz:webdriverClick'] = False
    driver = webdriver.Firefox(capabilities=firefox_capabilities)
    fp = webdriver.FirefoxProfile()
    fp.set_preference("browser.download.folderList", 2)  # 0: desktop, 1: default "Downloads" directory, 2: the directory below
    fp.set_preference("browser.download.dir", "/Users/user/work_archives_futura/")
    driver.get('https://forums.futura-sciences.com/')  # change the url to your website
    time.sleep(5)  # wait for redirection and rendering
    driver.delete_all_cookies()  # clean up the prior login sessions
    driver.find_element_by_xpath("//input[@name='vb_login_username']").send_keys(username)
    elem = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//input[@name='vb_login_password']")))
    elem.send_keys(Keys.TAB)
    driver.find_element_by_xpath("//input[@type='submit']").click()
    print("success !!!!")
    driver.close()  # close the browser
    return driver

if __name__ == '__main__':
    MS_login("USERNAME", "PASSWORD")
The window opens fine and the username is filled in, but it is impossible to fill in or submit the password and click on submit.
PS: the main issue probably comes from the fact that the password field has the display:none property, so I can't simulate the TAB operation to reach the password field and fill it in once I have entered the login.
It seems you're already pretty knowledgeable about scraping using the various methods. All that was missing was the correct field names in the POST request.
I used the Chrome dev tools (F12, then go to the Network tab). With this open, if you log in and quickly stop the browser window from redirecting, you'll be able to see the full request to login.php and look at the fields, etc.
With that I was able to build this for you. It includes a handy dumping function for responses. To test that my code works, you can use your real password for the positive case and the bad-password line for the negative case.
import requests
import json
s = requests.Session()
def dumpResponseData(r, fileName):
    print(r.status_code)
    print(json.dumps(dict(r.headers), indent=1))
    cookieDict = s.cookies.get_dict()
    print(json.dumps(cookieDict, indent=1))
    outfile = open(fileName, mode="w")
    outfile.write(r.text)
    outfile.close()

username = "your-username"
password = "your-password"
# password = "bad password"

def step1():
    data = dict()
    data["do"] = "login"
    data["vb_login_md5password"] = ""
    data["vb_login_md5password_utf"] = ""
    data["s"] = ""
    data["securitytoken"] = "guest"
    data["url"] = "/search.php?do=finduser&userid=1077817&contenttype=vBForum_Post&showposts=1"
    data["vb_login_username"] = username
    data["vb_login_password"] = password
    p = s.post('https://forums.futura-sciences.com/login.php?do=login', data=data)
    # Logged in?
    if "vbseo_loggedin" in s.cookies.keys():
        print("Logged In!")
    else:
        print("Login Failed :(")

if __name__ == "__main__":
    step1()
I don't have any posts in my newly created Futura account so I can't really do any more testing for you - I don't want to spam their forum with garbage.
But I would probably start by making a request to the post-search URL and scraping the links using bs4.
Then you could probably just use wget -r for each link you've scraped.
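A rough sketch of that combination, reusing the logged-in session s from step1() above; the userid comes from the question, the link filter is an assumption about how the thread URLs look, and wget itself runs outside the requests session (so member-only pages would need exported cookies passed via --load-cookies):
import subprocess
import requests
from bs4 import BeautifulSoup

# Reuse the logged-in session `s` created above; userid taken from the question
search_url = "https://forums.futura-sciences.com/search.php?do=finduser&userid=253205&contenttype=vBForum_Post&showposts=1"
r = s.get(search_url)
soup = BeautifulSoup(r.text, "html.parser")

# Assumption: the discussion links are ordinary <a> tags pointing back at the forum
links = {a["href"] for a in soup.find_all("a", href=True)
         if "forums.futura-sciences.com" in a["href"]}

for link in links:
    # wget is a separate process without the requests cookies; fine for public archive pages,
    # otherwise export the cookies and add --load-cookies
    subprocess.run(["wget", "-r", "-l", "1", "--page-requisites", "--convert-links",
                    "--html-extension", "--no-check-certificate", link])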
@Researcher is correct in their advice when it comes to the requests library: you are not posting all of the request params that the browser would send. Overall, I think it will be difficult to get requests to pull everything once you factor in static content and client-side JavaScript.
Your selenium code from section 4 has a few mistakes in it:
# yours
webdriver.find_element_by_id('ID').send_keys('USERNAME')
webdriver.find_element_by_id ('ID').send_keys('PASSWORD')
webdriver.find_element_by_id('submit').click()
# should be
webdriver.find_element_by_id('vb_login_username').send_keys('USERNAME')
webdriver.find_element_by_id('vb_login_password').send_keys('PASSWORD')
webdriver.find_element_by_xpath("//input[#type='submit']").click()
You may need to fiddle with the xpath for the submit button.
Hint: You can debug along the way by taking a screenshots :
webdriver.find_element_by_id('vb_login_username').send_keys('USERNAME')
webdriver.find_element_by_id('vb_login_password').send_keys('PASSWORD')
webdriver.get_screenshot_as_file('before_submit.png')
webdriver.find_element_by_xpath("//input[#type='submit']").click()
webdriver.get_screenshot_as_file('after_submit.png')

Transferring requests.Session() cookies to a selenium web driver in Python

After researching and tinkering, I seem to be stumped as to what I could try. I'm essentially looking to do the reverse of this question right here: Is it possible to "transfer" a session between selenium.webdriver and requests.session
I want to "click" on a JavaScript button on a webpage that I've "reached" through a series of GET/POST requests in a session (it's important that the cookies are maintained and seamlessly transferred since my GET/POST requests are on pages that require a logged-in user).
However, after some googling, I found that requests doesn't seem to offer something like that. I found selenium and have since been trying to properly transfer the cookies over (unsuccessfully).
import requests, requests.utils, lxml.html
from lxml.cssselect import CSSSelector
from selenium import webdriver
# urls which requests will be made to
login_url = 'login-url-here'
logged_in_data_url = 'logged-in-data-here'
# create my Session to contain my cookies
with requests.Session() as s:
    login_html = s.get(login_url)
    tree = lxml.html.fromstring(login_html.text)
    important_key1 = list(set(tree.xpath('//*[@id="fm1"]/div/div[3]/input[1]/@value')))[0]
    important_key2 = list(set(tree.xpath('//*[@id="fm1"]/div/div[3]/input[2]/@value')))[0]
    form_value = "submit"

    login_payload = {
        'post-field-1': 'post-data-1',
        'post-field-2': 'post-data-2',
        'important_key1': 'important_value1',
        'important_key2': 'important_value2',
        'important_key3': 'important_value3'
    }

    login_result = s.post(login_url,
                          data=login_payload,
                          headers=dict(referer=login_url))

    logged_in_data_html = s.get(logged_in_data_url)
    tree = lxml.html.fromstring(logged_in_data_html.text)
    print(logged_in_data_html.text)

    # Attempt at transferring cookies, currently fails
    cookie_dict = requests.utils.dict_from_cookiejar(s.cookies)
    driver = webdriver.Firefox()
    for cookie in cookie_dict:
        driver.add_cookie(cookie)

    driver.get(logged_in_data_url)

    # prints same contents as login_html.text,
    # meaning cookie transfer failed and the session was thrown out
    print(driver.page_source)
Any advice or pointers on what to do from here?
EDIT: My attempt with selenium-requests:
import seleniumrequests
import lxml.html
from lxml.cssselect import CSSSelector
# urls which requests will be made to
login_url = 'login-url-here'
logged_in_data_url = 'logged-in-data-here'
driver = seleniumrequests.Firefox()
login_html = driver.request('GET', login_url)
tree = lxml.html.fromstring(login_html.text)
important_key1 = list(set(tree.xpath('//*[@id="fm1"]/div/div[3]/input[1]/@value')))[0]
important_key2 = list(set(tree.xpath('//*[@id="fm1"]/div/div[3]/input[2]/@value')))[0]
form_value = "submit"

# following print statements print value1, value2 respectively
print("important_key1 = " + important_key1)
print("important_key2 = " + important_key2)

login_payload = {
    'post-field-1': 'post-data-1',
    'post-field-2': 'post-data-2',
    'important_key1': 'important_value1',
    'important_key2': 'important_value2',
    'important_key3': 'important_value3'
}

login_result = driver.request('POST', login_url,
                              data=login_payload,
                              headers=dict(referer=login_url))
# this should print out the landing page after being logged in
# source code contains important_key1, 2, and 3 with different values
# the GET and POST requests seem to be in different sessions
# how do I fix that?
print(login_result.text)
I don't believe it is possible to do that natively. There is, however, an extension to Selenium called selenium-requests that you should be able to use.
EDIT:
Try adding the following to your code. Upon reading the source, this should work (and use the requests Session that was auto-initialized during the POST request).
response = driver.request('GET', logged_in_data_url)

python urllib post question

I'm making a simple Python POST script, but it is not working well.
There are two parts to the login.
The first login uses 'http://mybuddy.buddybuddy.co.kr/userinfo/UserInfo.asp',
and the second login uses 'http://user.buddybuddy.co.kr/usercheck/UserCheckPWExec.asp'.
I can log in on the first login page, but I couldn't log in on the second page;
it returns an error like 'illegal access'.
I heard this is related to cookies, but I don't know how to implement a fix for this problem.
If anyone can help, it would be much appreciated! Thanks!
import re,sys,os,mechanize,urllib,time
import datetime,socket
params = urllib.urlencode({'ID':'ph896011', 'PWD':'pk1089' })
rq = mechanize.Request("http://mybuddy.buddybuddy.co.kr/userinfo/UserInfo.asp", params)
rs = mechanize.urlopen(rq)
data = rs.read()
logged_fail = r';history.back();</script>' in data
if not logged_fail:
    print 'login success'
    try:
        params = urllib.urlencode({'PASSWORD': 'pk1089'})
        rq = mechanize.Request("http://user.buddybuddy.co.kr/usercheck/UserCheckPWExec.asp", params)
        rs = mechanize.urlopen(rq)
        data = rs.read()
        print data
    except:
        print 'error'
Can't you use Selenium? IMHO it's better to do automation with it.
To install it, use:
pip install selenium
An example:
from selenium import webdriver
browser = webdriver.Firefox()
# open site
browser.get('http://google.com.br')
# get page source
browser.page_source
A login example:
# different methods to get a html item
form = browser.find_element_by_tag_name('form')
username = browser.find_element_by_id('input_username')
password = browser.find_element_by_css_selector('input[type=password]')
username.send_keys('myUser')
password.send_keys('myPass')
form.submit()

Fill form values in a web page via a Python script (not testing)

I need to fill in form values on a target page and then click a button via Python. I've looked at Selenium and Windmill, but these are testing frameworks - I'm not testing. I'm trying to log into a third-party website programmatically, then download and parse a file we need to insert into our database. The problem with the testing frameworks is that they launch instances of browsers; I just want a script I can schedule to run daily to retrieve the page I want. Any way to do this?
You are looking for Mechanize
Form submitting sample:
import re
from mechanize import Browser
br = Browser()
br.open("http://www.example.com/")
br.select_form(name="order")
# Browser passes through unknown attributes (including methods)
# to the selected HTMLForm (from ClientForm).
br["cheeses"] = ["mozzarella", "caerphilly"] # (the method here is __setitem__)
response = br.submit() # submit current form
Have a look at this example, which uses Mechanize; it will give you the basic idea:
#!/usr/bin/python
import re
from mechanize import Browser
br = Browser()
# Ignore robots.txt
br.set_handle_robots( False )
# Google demands a user-agent that isn't a robot
br.addheaders = [('User-agent', 'Firefox')]
# Retrieve the Google home page, saving the response
br.open( "http://google.com" )
# Select the search box and search for 'foo'
br.select_form( 'f' )
br.form[ 'q' ] = 'foo'
# Get the search results
br.submit()
# Find the link to foofighters.com; why did we run a search?
resp = None
for link in br.links():
    siteMatch = re.compile('www.foofighters.com').search(link.url)
    if siteMatch:
        resp = br.follow_link(link)
        break
# Print the site
content = resp.get_data()
print content
You can use the standard urllib library to do this like so:
import urllib
urllib.urlretrieve("http://www.google.com/", "somefile.html", lambda x,y,z:0, urllib.urlencode({"username": "xxx", "password": "pass"}))
The Mechanize example as suggested seems to work. In input fields where you must enter text, use something like:
br["kw"] = "rowling" # (the method here is __setitem__)
If some content is generated after you submit the form, as in a search engine, you get it via:
print response.read()
For checkboxes, use 1 & 0 as true & false respectively:
br["checkboxname"] = 1 #checked = true
br["checkboxname2"] = 0 #checked = false
