I am trying to submit a form to http://apps.fas.usda.gov/esrquery/esrq.aspx in Python, using the following code:
import urllib
from bs4 import BeautifulSoup
import mechanize
import datetime
today = datetime.date.today().strftime("%m/%d/%Y")
url = 'http://apps.fas.usda.gov/esrquery/esrq.aspx'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
viewstate = soup.find('input', {'id' : '__VIEWSTATE'})['value']
eventval = soup.find('input', {'id' : '__EVENTVALIDATION'})['value']
br = mechanize.Browser(factory=mechanize.RobustFactory())
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open(url)
# fill form
br.select_form("aspnetForm")
br.form.set_all_readonly(False)
br.form['__EVENTTARGET'] = ''
br.form['__EVENTARGUMENT'] = ''
br.form['__LASTFOCUS'] = ''
br.form['__VIEWSTATE'] = viewstate
br.form['__VIEWSTATEGENERATOR'] = '41AA5B91'
br.form['__EVENTVALIDATION'] = eventval
br.form['ctl00$MainContent$lbCommodity'] = ['401']
br.form['ctl00$MainContent$lbCountry'] = ['0:0']
br.form['ctl00$MainContent$ddlReportFormat'] = ['10']
br.find_control('ctl00$MainContent$cbxSumGrand').items[0].selected = True
br.find_control('ctl00$MainContent$cbxSumKnown').items[0].selected = False
br.form['ctl00$MainContent$rblOutputType'] = ['2']
br.form['ctl00$MainContent$tbStartDate'] = '01/01/1999'
br.form['ctl00$MainContent$ibtnStart'] = ''
br.form['ctl00$MainContent$tbEndDate'] = today
br.form['ctl00$MainContent$ibtnEnd'] = ''
br.form['ctl00$MainContent$rblColumnSelection'] = ['regular']
response = br.submit()
The response I am getting is essentially just the HTML of the site with the form filled out as expected. However, I was expecting an Excel file (since I selected an OutputType value of 2).
I think I am missing something on the submission front. Could somebody shed some light on what I am missing?
You're close, but there's more you need to do after submitting. In this case, just add:
doc = response.read()
ofile = '<your path>'
with open(ofile, 'w') as f:
    f.write(doc)
I couldn't actually test this on your website at the moment, so I'm just assuming all your settings before this are correct. I only have Python 3 at work, and mechanize only works on 2.x. Regardless, this is generally the way you want to retrieve this sort of output.
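If the site really does hand back an Excel (binary) file once the form settings are right, writing in binary mode is a bit safer so nothing gets mangled; a minimal sketch, assuming the mechanize response above and a placeholder output path:
# Minimal sketch: save whatever the server returned as a binary file.
# 'esr_report.xls' is just a placeholder path.
doc = response.read()                    # raw bytes of the response body
with open('esr_report.xls', 'wb') as f:  # 'wb' so binary content isn't altered
    f.write(doc)
If the saved file still opens as HTML, the postback is returning the page rather than the report, and the problem is in the form submission itself rather than in how the response is saved.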
Related
I'm trying to create a script using the requests module (without using a session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a session, I could fetch the content from that site flawlessly.
Here are the manual steps to reach the content:
Choose the first item from dropdown.
Get the links to the detail page.
Grab these two fields from detail page.
While creating the script with plain requests, I tried to make use of cookies, but I ended up getting an AttributeError.
Script without session:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie, inner_link

def get_content(str_cookie, inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
What possible change should I bring about to make the script work?
There's a redirect that occurs in fetch_detail_page_link. Python Requests follows redirects by default, so when your script grabs the cookies, it only gets the cookies set on the final response in the chain. You need to look at the response's history field to see the redirects that were followed. Doing this with a Session object worked because the session was preserving those cookies for you.
I must agree with others who have commented that it really would be a good idea to use a Session object for this. However, if you insist on not using a Session, your script would look like this:
import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'
headers = {
'User-Agent': "Scraping Your Vigentes 1.0",
}
def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()  # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)  # merge in cookies from each redirect response
    cookies.update(res.cookies)    # merge in cookies from the final response
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link

def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
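For comparison, the Session-based route the commenters suggested stays much shorter, because the session keeps the cookies from every hop of the redirect chain for you; a rough sketch reusing the URLs, headers and selectors defined above:
# Rough sketch of the Session-based alternative (same URLs/selectors as above).
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

with requests.Session() as s:
    s.headers.update(headers)  # same headers dict as above
    landing = s.get(link)
    soup = BeautifulSoup(landing.text, "html.parser")
    cat_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    listing = s.get(cat_link)  # cookies from any redirects are stored on the session
    soup = BeautifulSoup(listing.text, "html.parser")
    for row in soup.select("table.list-table > tbody.list-tbody > tr"):
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", row.select_one("a.detailLink").get("onclick"))[0]
        detail = s.get(vigen_detail_page.format(detail_num))  # no manual Cookie header needed
        # parse 'Código del Expediente' / 'Descripción del Expediente' exactly as in get_content()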
I'm writing an application which needs to communicate with an ASP.NET website that doesn't have an API.
I'm using Python 3.5.2 with Requests 2.18.4 to achieve my purpose.
The problem is that the site uses __doPostBack(), so I achieved my goal using Selenium, but it was slow and it opened a browser for every request I wanted to make.
First, I analyzed the POST data sent to the website for login using Burp Suite. It contained some hidden fields from the page's HTML, plus the username, password, captcha, and a radio button.
A successful login attempt returns a 302 HTTP response code and redirects the browser to the profile page.
I extract the hidden fields, fetch the captcha image and enter it manually, then POST the data to the website, but it returns a 200 response saying an error occurred.
This is the Code:
import re
import requests
from urllib.request import urlretrieve
from collections import OrderedDict
from bs4 import BeautifulSoup as BS
from PIL import Image

def get_and_show_captcha(session):
    # uses the module-level response `r` (the login page fetched below with this session)
    soup = BS(r.text, "html.parser")
    image_url = soup.find("img", {"id": 'RadCaptcha1_CaptchaImageUP'})["src"]
    Image.open(urlretrieve(base_url + image_url)[0]).show()

base_url = 'http://example.com/'
section_login = 'login.aspx'

proxies = {
    'http': 'http://127.0.0.1:8080',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Upgrade-Insecure-Requests': '1',
}

session = requests.Session()
session.headers.update(headers)

r = session.get('http://example.com/login.aspx', proxies=proxies)

login_post_dict = OrderedDict()
login_post_dict['__EVENTTARGET'] = ''
login_post_dict['__EVENTARGUMENT'] = ''
login_post_dict['__VIEWSTATE'] = ''
login_post_dict['__VIEWSTATEGENERATOR'] = ''
login_post_dict['__PREVIOUSPAGE'] = ''
login_post_dict['__EVENTVALIDATION'] = ''
login_post_dict['txtUserName'] = 'USERNAME'
login_post_dict['txtPassword'] = 'PASSWORD'

get_and_show_captcha(session=session)
captcha = input('CAPTCHA: ')
login_post_dict['rcTextBox1'] = captcha
login_post_dict['RadCaptcha1_ClientState'] = ''
login_post_dict['SOMERADIO'] = 'RADIOBUTTON'
login_post_dict['btn_enter'] = 'ENTER'
login_post_dict['txtUserName2'] = ''
login_post_dict['txt_email'] = ''

soup = BS(r.text, 'html.parser')
capture_id = re.compile("id=\"(\w*)\"")
capture_value = re.compile("value=\"(.*)\"")
for html_input in soup.find_all('input', attrs={'type': 'hidden'}):
    for hidden_id in capture_id.findall(str(html_input)):
        if hidden_id != r"RadCaptcha1_ClientState":
            login_post_dict[hidden_id] = ''
        for hidden_id_value in capture_value.findall(str(html_input)):
            if hidden_id_value == '':
                continue
            login_post_dict[hidden_id] = hidden_id_value

session.headers.update({'Referer': base_url + section_login})
print(login_post_dict)
session.post(base_url + section_login, data=login_post_dict, proxies=proxies)
I send the script's data through Burp Suite in order to see exactly what's being sent.
Any solution?
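For what it's worth, the same check can also be done from Python itself by preparing the request on the session and printing it before sending; a small sketch using the names already defined above:
# Sketch: inspect exactly what will be sent, using the session/headers above.
req = requests.Request('POST', base_url + section_login,
                       data=login_post_dict,
                       headers={'Referer': base_url + section_login})
prepared = session.prepare_request(req)  # merges in session headers and cookies
print(prepared.headers)                  # final headers, including any Cookie
print(prepared.body)                     # url-encoded form body
resp = session.send(prepared, proxies=proxies)
print(resp.status_code, resp.url)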
I am trying to scrape some emails from mdpi.com that are available only to logged-in users, but it fails when I try to do so: I am getting the same content I would get when logged out.
The code itself:
import requests
from bs4 import BeautifulSoup
import traceback
login_data = {'form[email]': 'xxxxxxx#gmail.com', 'form[password]': 'xxxxxxxxx', 'remember': 1,}
base_url = 'http://www.mdpi.com'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}
session = requests.Session()
session.headers = headers
# log_in
s = session.post('https://susy.mdpi.com/user/login', data=login_data)
print(s.text)
print(session.cookies)
def make_soup(url):
    try:
        r = session.get(url)
        soup = BeautifulSoup(r.content, 'lxml')
        return soup
    except:
        traceback.print_exc()
        return None

example_link = 'http://www.mdpi.com/search?journal=medsci&year_from=1996&year_to=2017&page_count=200&sort=relevance&view=default'

def article_finder(soup):
    one_page_articles_divs = soup.find_all('div', class_='article-content')
    for article_div in one_page_articles_divs:
        a_link = article_div.find('a', class_='title-link')
        link = base_url + a_link.get('href')
        print(link)
        article_soup = make_soup(link)
        grab_author_info(article_soup)

def grab_author_info(article_soup):
    # title of the article
    article_title = article_soup.find('h1', class_="title").text
    print(article_title)

    # affiliation
    affiliations_div = article_soup.find('div', class_='art-affiliations')
    affiliation_dict = {}
    aff_indexes = affiliations_div.find_all('div', class_='affiliation-item')
    aff_values = affiliations_div.find_all('div', class_='affiliation-name')
    for i, index in enumerate(aff_indexes):  # 0, 1
        affiliation_dict[int(index.text)] = aff_values[i].text

    # authors names
    authors_div = article_soup.find('div', class_='art-authors')
    authors_spans = authors_div.find_all('span', class_='inlineblock')
    for span in authors_spans:
        name_and_email = span.find_all('a')  # name and email
        name = name_and_email[0].text
        # email
        email = name_and_email[1].get('href')[7:]
        # affiliation_index
        affiliation_index = span.find('sup').text
        indexes = set()
        if len(affiliation_index) > 2:
            for i in affiliation_index.strip():
                try:
                    ind = int(i)
                    indexes.add(ind)
                except ValueError:
                    pass
        print(name)
        for index in indexes:
            print('affiliation =>', affiliation_dict[index])
        print('email: {}'.format(email))

if __name__ == '__main__':
    article_finder(make_soup(example_link))
What should I do in order to get what I want?
Ah, that is easy: you haven't managed to log in correctly. If you look at the response from your initial call, you will see that you are returned the login page's HTML instead of the profile page. The reason is that you are not submitting the hidden token on the form.
The solution: request the login page, then use either lxml or BeautifulSoup to parse out the hidden input 'form[_token]'. Get that value and add it to your login_data payload.
Then submit your login request and you'll be in.
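A minimal sketch of that flow, assuming the hidden input is named 'form[_token]' (check the login page's HTML if the field name differs):
# Sketch: fetch the login page, pull the hidden token, then log in.
login_url = 'https://susy.mdpi.com/user/login'
login_page = session.get(login_url)
login_soup = BeautifulSoup(login_page.content, 'lxml')
token_input = login_soup.find('input', {'name': 'form[_token]'})  # assumed field name
login_data['form[_token]'] = token_input.get('value')
s = session.post(login_url, data=login_data)  # should now land on the profile page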
I am using mechanize and BeautifulSoup to scrape an HTML web page which is essentially an HTML web query holding over 100,000 rows of data. My code below is able to scrape the data, but only part of it: 24,998 rows, to be precise.
import mechanize, urllib2, cookielib
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
cj = cookielib.CookieJar()
br = mechanize.Browser()
br.set_cookiejar(cj)
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open("https://www.google.com/doubleclick/search/reports/download?ay=20700000000000476&av=0&rid=28047&of=webqueryphtml")
br.select_form(nr=0)
br.form['Email'] = 'xxxxx#gmail.com'
br.form['Passwd'] = 'xxxxx'
br.submit()
soup = BeautifulSoup(br.response().read())
file = open('webquery_out.txt','w')
for row in soup.findAll('tr')[1:]:
    col = row.findAll('td')
    Row_Type = col[0].get_text(' ', True)
    Status = col[1].get_text(' ', True)
    Conv_ID = col[2].get_text(' ', True)
    Ad_Conv_ID = col[3].get_text(' ', True)
    Conv_Rev = col[4].get_text(' ', True)
    Conv_Org_Rev = col[5].get_text(' ', True)
    Engine = col[6].get_text(' ', True)
    Adv_ID = col[7].get_text(' ', True)
    record = (Row_Type, Status, Conv_ID, Ad_Conv_ID, Conv_Rev, Conv_Org_Rev, Engine, Adv_ID)
    line = "|".join(record)
    file.write(line + '\n')
I do not know whether it is the user agent or the write that is causing the partial result. I would appreciate any help.
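One quick way to narrow this down is to count the rows BeautifulSoup actually sees before writing anything, and to let a with block handle flushing and closing the file; a small sketch reusing the soup from above:
# Sketch: check how many rows came back, then write with the file closed automatically.
rows = soup.findAll('tr')[1:]
print len(rows)  # if this is already ~24998, the report itself was truncated server-side
with open('webquery_out.txt', 'w') as out:
    for row in rows:
        col = row.findAll('td')
        record = tuple(c.get_text(' ', True) for c in col[:8])  # the same 8 columns as above
        out.write("|".join(record) + '\n')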
I'm trying to pull a specific header from a website with a Python script. If I can get this working, I'll actually be pulling image names, but I figured I would start with something simple like the header. I can pull the header name and save the date to a CSV file, but the header name doesn't get written. To clarify, each time I save, the date saves to the file but the header name doesn't.
Here is my python script:
import urllib2
from bs4 import BeautifulSoup
import csv
import time
import os
def get_html():
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
    )
    opener.add_handler = [
        ('User-agent',
         "Mozilla/4.0 (compatible; MSIE 7.0; "
         "Windows NT 5.1; .NET CLR 2.0.50727; "
         ".NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)")
    ]
    url = "http://www.photosbywagner.com/galleryone.html"
    response = opener.open(url)
    return ''.join(response.readlines())

def get_html_sections(html):
    soup = BeautifulSoup(html)
    html_section = soup.find('div', attrs={'id': 'headcontainer'})
    return html_section

def parse_section_html(html):
    selected_html = get_html_sections(html)
    result = {}
    # <img />
    result['selection'] = selected_html.find('h1').contents[0]
    return result

field_order = ['date', 'info']
fields = {'date': 'Date',
          'info': 'Info'}

def write_row(selected_html):
    file_name = "WrittenNames" + ".csv"
    if os.access(file_name, os.F_OK):
        file_mode = 'ab'
    else:
        file_mode = 'wb'
    csv_writer = csv.DictWriter(
        open(file_name, file_mode),
        fieldnames=field_order,
        extrasaction='ignore',
    )
    if file_mode == 'wb':
        csv_writer.writerow(fields)
    csv_writer.writerow(selected_html)

if __name__ == '__main__':
    html = get_html()
    row_to_write = parse_section_html(html)
    row_to_write['date'] = time.strftime("%Y-%m-%d %H:%M")
    write_row(row_to_write)
    print row_to_write
Here is the webpage:
http://www.photosbywagner.com/galleryone.html
The section I'm pulling is:
<div id="headcontainer"><h1>Ed Wagner Photo Gallery:</h1></div>
I'm not sure what I'm missing here, since everything else seems to check out - it pulls the right info and prints it to the screen, it just won't save to the CSV.
Thanks.
In parse_section_html, should:
result['selection'] = selected_html.find('h1').contents[0]
be:
result['info'] = selected_html.find('h1').contents[0]
to match your fields definition?
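For context, csv.DictWriter with extrasaction='ignore' silently drops any key that isn't listed in fieldnames, which is why only the date survives; once the key matches, both columns should be written. A quick sketch of the corrected function:
def parse_section_html(html):
    selected_html = get_html_sections(html)
    result = {}
    # store the <h1> text under 'info' so DictWriter (fieldnames=['date', 'info']) keeps it
    result['info'] = selected_html.find('h1').contents[0]
    return result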