How to crawl ASP.NET web applications - python

I'm writing an application which needs to communicate with an ASP.NET website that doesn't have an API.
I'm using Python 3.5.2 with Requests 2.18.4 to achieve my purpose.
The problem is that the site uses __doPostBack(), so I initially achieved my goal with Selenium, but it was slow and opened a browser for every request I wanted to make.
First I analyzed the POST data sent to the website for login using Burp Suite. It contained some hidden fields taken from the page HTML, plus the username, password, captcha and a radio button.
A successful login attempt returns a 302 HTTP response code and redirects the browser to the profile page.
I extract the hidden fields, fetch the captcha image and enter it manually, then POST the data to the website, but it returns a 200 response saying an error occurred.
This is the code:
import re
import requests
from urllib.request import urlretrieve
from collections import OrderedDict
from bs4 import BeautifulSoup as BS
from PIL import Image

def get_and_show_captcha(session, response):
    soup = BS(response.text, "html.parser")
    image_url = soup.find("img", {"id": 'RadCaptcha1_CaptchaImageUP'})["src"]
    Image.open(urlretrieve(base_url + image_url)[0]).show()

base_url = 'http://example.com/'
section_login = 'login.aspx'

proxies = {
    'http': 'http://127.0.0.1:8080',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Upgrade-Insecure-Requests': '1',
}

session = requests.Session()
session.headers.update(headers)

r = session.get('http://example.com/login.aspx', proxies=proxies)

login_post_dict = OrderedDict()
login_post_dict['__EVENTTARGET'] = ''
login_post_dict['__EVENTARGUMENT'] = ''
login_post_dict['__VIEWSTATE'] = ''
login_post_dict['__VIEWSTATEGENERATOR'] = ''
login_post_dict['__PREVIOUSPAGE'] = ''
login_post_dict['__EVENTVALIDATION'] = ''
login_post_dict['txtUserName'] = 'USERNAME'
login_post_dict['txtPassword'] = 'PASSWORD'

get_and_show_captcha(session=session, response=r)
captcha = input('CAPTCHA: ')
login_post_dict['rcTextBox1'] = captcha
login_post_dict['RadCaptcha1_ClientState'] = ''
login_post_dict['SOMERADIO'] = 'RADIOBUTTON'
login_post_dict['btn_enter'] = 'ENTER'
login_post_dict['txtUserName2'] = ''
login_post_dict['txt_email'] = ''

soup = BS(r.text, 'html.parser')
capture_id = re.compile(r'id="(\w*)"')
capture_value = re.compile(r'value="(.*)"')
for html_input in soup.find_all('input', attrs={'type': 'hidden'}):
    for hidden_id in capture_id.findall(str(html_input)):
        if hidden_id != "RadCaptcha1_ClientState":
            login_post_dict[hidden_id] = ''
        for hidden_id_value in capture_value.findall(str(html_input)):
            if hidden_id_value == '':
                continue
            login_post_dict[hidden_id] = hidden_id_value

session.headers.update({'Referer': base_url + section_login})
print(login_post_dict)

session.post(base_url + section_login, data=login_post_dict, proxies=proxies)
I sent the script's data through Burp Suite in order to see exactly what is being sent.
Any solution?
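Two details are easy to trip over with this flow, sketched below for reference (illustrative only, not a confirmed fix): the captcha image should be downloaded through the same requests.Session so the ASP.NET session cookies match what the server expects, and requests follows redirects by default, so a successful 302 will otherwise show up as a 200 from the profile page.

from io import BytesIO

import requests
from PIL import Image
from bs4 import BeautifulSoup as BS

session = requests.Session()
r = session.get('http://example.com/login.aspx')

soup = BS(r.text, 'html.parser')
captcha_src = soup.find('img', {'id': 'RadCaptcha1_CaptchaImageUP'})['src']

# Fetch the captcha with the SAME session; urlretrieve opens a fresh,
# cookie-less connection, so its captcha belongs to a different session.
img = session.get('http://example.com/' + captcha_src)
Image.open(BytesIO(img.content)).show()

login_post_dict = {}                       # filled from the hidden inputs as in the code above
login_post_dict['txtUserName'] = 'USERNAME'
login_post_dict['txtPassword'] = 'PASSWORD'
login_post_dict['rcTextBox1'] = input('CAPTCHA: ')

# Disable redirect-following so a successful login is visible as the raw 302.
resp = session.post('http://example.com/login.aspx',
                    data=login_post_dict,
                    allow_redirects=False)
print(resp.status_code)                    # 302 -> accepted, 200 -> the error page came back
print(resp.headers.get('Location'))        # profile page URL on success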

Related

Can't Scrape Dynamically Loaded HTML Table in an Aspx Website

I am trying to scrape some data from the Arizona Medical Board. I search for Anesthesiology in the specialty dropdown list, and I find that the table (with the links to the profiles I want to scrape) is dynamically loaded into the website. I notice that when hitting the 'specialty search' button, a POST request is made to the server and the HTML table is actually returned from the server. I have tried simulating this POST request to see if I receive this HTML table and can then parse it with bs4. Is this possible, and if so, am I even on the right track?
I have tried to include the form data I found in the network tab of the developer tools, but I am not sure if this is the right data, or if I am forgetting some data here or in the header.
Please let me know if I need to clarify; I understand this may not be worded the best. Thank you!
import requests
# import re
import formdata

session = requests.Session()

url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/public/WebVerificationSearch.aspx?q=azmd&t=20220622123512"
headers = {'User-Agent': 'My-Agent-Placeholder'}
res = session.get(url, headers=headers)
print("Response: {}".format(res))

payload = {
    "__VIEWSTATE": formdata.state,
    "__VIEWSTATEGENERATOR": formdata.generator,
    "__EVENTVALIDATION": formdata.validation,
    "ctl00%24ContentPlaceHolder1%24Name": "rbName1",
    "ctl00%24ContentPlaceHolder1%24txtLastName": '',
    "ctl00%24ContentPlaceHolder1%24txtFirstName": '',
    "ctl00%24ContentPlaceHolder1%24License": "rbLicense1",
    "ctl00%24ContentPlaceHolder1%24txtLicNum": '',
    "ctl00%24ContentPlaceHolder1%24Specialty": "rbSpecialty1",
    "ctl00%24ContentPlaceHolder1%24ddlSpecialty": '12155',
    "ctl00%24ContentPlaceHolder1%24ddlCounty": '15910',
    "ctl00%24ContentPlaceHolder1%24txtCity": '',
    "__EVENTTARGET": "ctl00%24ContentPlaceHolder1%24btnSpecial",
    "__EVENTARGUMENT": ''
}

# params = {"q": "azmd",
#           "t": "20220622123512"}
# url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/Public/Results.aspx"

res = session.post(url, data=payload, headers=headers)
print("Post response: {}".format(res))
print(res.text)
# res = requests.get('https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/Public/Results.aspx', headers=headers)
Try:
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0"
}

url = "https://azbomprod.azmd.gov/GLSuiteWeb/Clients/AZBOM/public/WebVerificationSearch.aspx?q=azmd&t=20220622082816"

with requests.session() as s:
    soup = BeautifulSoup(s.get(url, headers=headers).content, "html.parser")

    data = {}
    for inp in soup.select("input"):
        data[inp.get("name")] = inp.get("value", "")

    data["ctl00$ContentPlaceHolder1$Name"] = "rbName1"
    data["ctl00$ContentPlaceHolder1$License"] = "rbLicense1"
    data["ctl00$ContentPlaceHolder1$Specialty"] = "rbSpecialty1"
    data["ctl00$ContentPlaceHolder1$ddlSpecialty"] = "12155"
    data["ctl00$ContentPlaceHolder1$ddlCounty"] = "15910"
    data["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$btnSpecial"
    data["__EVENTARGUMENT"] = ""

    soup = BeautifulSoup(
        s.post(url, data=data, headers=headers).content, "html.parser"
    )

    for row in soup.select("tr:has(a)"):
        name = row.select("td")[-1].text
        link = row.a["href"]
        print("{:<35} {}".format(name, link))
Prints:
Abad-Pelsang, Elma A. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1620623&licID=121089&licType=1
Abadi, Bilal Ibrahim https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1755530&licID=525771&licType=1
Abbasian, Mohammad https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1635449&licID=492537&licType=1
Abdel-Al, Naglaa Z. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1637612&licID=175204&licType=1
Abedi, Babak https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1641219&licID=169009&licType=1
Abel, Martin D. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1624271&licID=510929&licType=1
Abenstein, John P. https://azbomprod.azmd.gov/glsuiteweb/clients/azbom/Public/Profile.aspx?entID=1622930&licID=502482&licType=1
...and so on.
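If the rows need to be kept rather than printed, the same loop can feed a CSV writer (a small usage sketch; the output filename and column headers are arbitrary choices, not part of the original answer):

import csv

# `soup` is the parsed POST response from the snippet above
with open("az_board_results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "profile_url"])
    for row in soup.select("tr:has(a)"):
        writer.writerow([row.select("td")[-1].text.strip(), row.a["href"]])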

How to bypass AKAMAI bot detection for data scraping using requests_html, Beautiful Soup

I am scraping data from the Rakuten Japanese e-commerce website using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. On the other hand, when I use wget and request the page URL from the server, I get the full page; it is only my script that doesn't work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running on my local machine:
[
    {
        "title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)"
    }
]
But I don't get any response when running it on my server; it just shows
this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea on how this could be done, I would be grateful to hear.
I've tried your code on an EC2 instance in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double-check the headers (I've modified yours a bit - see the code below)
check your query input; maybe some of the queries are malformed?
don't spray the Rakuten server with too many requests from one EC2; maybe they're blocking you already
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
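On the last point above, if one EC2 instance is issuing many searches, a simple pause between them keeps the request rate down (a sketch; the sample terms are taken from the query above and the delay is an arbitrary value, not a documented Rakuten limit):

import time

search_terms = ['スマートキー 電波遮断ケース', 'キーケース リレーアタック防止用']  # sample terms from the query above
for term in search_terms:
    print(main_rakuten_product_search(term))
    time.sleep(5)  # arbitrary pause so the server isn't hit with back-to-back requests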

I cannot autologin to pastebin using requests + BeautifulSoup

I am trying to auto-login to a Pastebin account using Python, but I'm failing and I don't know why. I copied the request headers exactly and double-checked... but I am still greeted with a 400 HTTP code. Can somebody help me?
This is my code:
import requests
from bs4 import BeautifulSoup
import subprocess
import os
import sys
from requests import Session

# the actual program
page = requests.get("https://pastebin.com/99qQTecB")
parse = BeautifulSoup(page.content, 'html.parser')
string = parse.find("textarea")
command = 'hello'

###########################################################

URL = 'https://pastebin.com'
LOGIN_ROUTE = '/login'

d = requests.session()
cfduid = d.get(URL).cookies['__cfduid']
e = requests.session()
csrf = e.get(URL).cookies['_csrf-frontend']
f = requests.session()
pastebin = f.get(URL).cookies['pastebin-frontend']

print(csrf)
print(cfduid)
print(pastebin)

HEADERS = {'Host': 'pastebin.com', 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5',
           'Accept-Encoding': 'gzip, deflate, br', 'Referer': 'https://pastebin.com/login',
           'Content-Type': 'application/x-www-form-urlencoded', 'Content-Length': '174', 'Connection': 'keep-alive',
           'Cookie': "__cfduid=" + cfduid + ";", '_csrf-frontend': csrf + ";",
           'pastebin-frontend': pastebin + ";", 'Upgrade-Insecure-Requests': '1'}

if command in string:
    subprocess.call('/super_mario_bros.exe', shell=True)

    s = requests.session()
    csrf_token = s.get(URL).cookies['_csrf-frontend']

    login_payload = {
        'LoginForm[username]': 'SECRET',
        'LoginForm[password]': 'Secret',
        '_csrf-frontend': csrf_token
    }

    # Actual Login
    login_req = s.post(URL + LOGIN_ROUTE, data=login_payload)
    print(csrf_token)
    print(login_req.status_code)
else:
    print("smth")

# print(string)
# Cookie: __cfduid=d955068eb3316226875aa037c059fd8f11595841495; __gads=ID=49ab9fcccba85989:T=1595841504:S=ALNI_MYT-PJZzkGrbYunHFHQE-EEw3vfhQ; _ga=GA1.2.1839432134.1595810341; pastebin-frontend=a9bf2e8c462237148c2d5f6b0832387c; _csrf-frontend=8213fdd4c42f9cfad45ed4993a25a1753c3539b1700c9e92be8ffac00780e34ea%3A2%3A%7Bi%3A0%3Bs%3A14%3A%22_csrf-frontend%22%3Bi%3A1%3Bs%3A32%3A%22HHm8aNkQL8XAG8joV8KfCZqOTls50yyW%22%3B%7D; _gid=GA1.2.1476917794.1596212111
I am really exhausted because I've been trying to make this work for the last couple of hours, but the 400 code is still there. Thank you in advance!
I can recommend browser_cookie3:
import browser_cookie3
import requests
from bs4 import BeautifulSoup

cj = browser_cookie3.load()
s = requests.Session()
for c in cj:
    if 'pastebin' in str(c):
        s.cookies.set_cookie(c)

r = s.get('https://pastebin.com/')
soup = BeautifulSoup(r.content, 'lxml')
print(soup.select_one('div.header__user-name').text)
prints:
UWTD
This code takes the session cookies from the browser and uses them, and prints the username of the logged-in user.
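If reusing browser cookies isn't an option, the usual alternative is to log in directly: fetch the login page with one requests.Session, read the hidden _csrf-frontend field out of the form, and send the POST from that same session. A minimal sketch, assuming the field names shown in the question are still what the form uses:

import requests
from bs4 import BeautifulSoup

with requests.Session() as s:
    login_page = s.get('https://pastebin.com/login')
    soup = BeautifulSoup(login_page.content, 'html.parser')
    # token from the hidden form field of the page this session just loaded
    token = soup.find('input', {'name': '_csrf-frontend'})['value']

    payload = {
        'LoginForm[username]': 'SECRET',
        'LoginForm[password]': 'Secret',
        '_csrf-frontend': token,
    }
    resp = s.post('https://pastebin.com/login', data=payload,
                  headers={'Referer': 'https://pastebin.com/login'})
    print(resp.status_code)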

Python Translation to .NET C#

I would like help finding the equivalent C# code for the Python below. Mainly, I don't understand the BeautifulSoup and html.parser parts.
Which libraries would I use in C#? I'm asking for a demonstration with a code example. Thanks for all the help.
import urllib.parse
import urllib.request
import time
from bs4 import BeautifulSoup

def check_acc(user, password):
    POST = 'POST'
    GET = 'GET'
    data = "userLoginId={}&password={}&STAYIN=false&flow=websiteSignUp&mode=login&action=loginAction&withFields=STAYIN%2CnextPage%2CuserLoginId%2Cpassword%2CcountryCode%2CcountryIsoCode&authURL=authURL%2FEcpWPKhtag%3D&nextPage=&showPassword=&countryCode=%2B33&countryIsoCode=USA".format(email, password).encode("utf-8")
    request_login = urllib.request.Request("https://www.mywebsitetest:8080/Admin",
        data=data,
        headers={
            'Host': 'www.mywebsitetest.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'fr-FR',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.mywebsitetest/USA/Login',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Content-Length': 330},
        unverifiable=True,
        method='POST')
    resfused_msg = "Wrong combination</a>.".format('<a href="/">')
    try:
        response = urllib.request.urlopen(request_login)
        readed = response.read()
        soup = BeautifulSoup(readed, "html.parser")
        IsTrue = str(soup.findAll("div", attrs={"class": "ui-message-contents"}).text)
        if IsTrue == resfused_msg:
            print("Refused => ({}:{})".format(user, password))
        else:
            print("Accepted => ({}:{})".format(user, password))
            result_file = open(result, "a")
            result_file.write(user + ":" + password)
    except Exception as e:
        print(f"[-] Sorry an error as occured.\n=> {e}\n")
        print("[+]Closing program.\n")
        time.sleep(2)
        exit()

combo = "working.txt"
try:
    combo_file = open(combo, "r")
    line = list(combo)
    print("[+]Combolist loaded.\n")
    for line in combo_file:
        user = line.split(":")[0]
        password = line.split(":")[1]
        check_acc(user, password)
except FileNotFoundError:
    print("[-]NOthing found.")
    print("[+]Bye!...\n")
    time.sleep(20)
    exit()
The Html Agility Pack is one of the most common HTML parsing packages used in C#. It is available via direct download or from the NuGet package feed. The documentation is available here. You should also get the CssSelectors NuGet package to allow you to use CSS selectors to extract elements from the DOM.
To get your div with the ui-message-contents class you will do something like:
// You will need to pass in/configure the headers and other fields for your request before you call GetResponse
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
document.Load(response.GetResponseStream());
IList<HtmlNode> elements = document.QuerySelectorAll("div.ui-message-contents");
// or if you're only expecting a single result
HtmlNode element = document.QuerySelector("div.ui-message-contents");

Submitting form using mechanize in python

I am trying to submit a form to http://apps.fas.usda.gov/esrquery/esrq.aspx in Python, using the following code:
import urllib
from bs4 import BeautifulSoup
import mechanize
import datetime
today = datetime.date.today().strftime("%m/%d/%Y")
url = 'http://apps.fas.usda.gov/esrquery/esrq.aspx'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
viewstate = soup.find('input', {'id' : '__VIEWSTATE'})['value']
eventval = soup.find('input', {'id' : '__EVENTVALIDATION'})['value']
br = mechanize.Browser(factory=mechanize.RobustFactory())
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open(url)
# fill form
br.select_form("aspnetForm")
br.form.set_all_readonly(False)
br.form['__EVENTTARGET'] = ''
br.form['__EVENTARGUMENT'] = ''
br.form['__LASTFOCUS'] = ''
br.form['__VIEWSTATE'] = viewstate
br.form['__VIEWSTATEGENERATOR'] = '41AA5B91'
br.form['__EVENTVALIDATION'] = eventval
br.form['ctl00$MainContent$lbCommodity'] = ['401']
br.form['ctl00$MainContent$lbCountry'] = ['0:0']
br.form['ctl00$MainContent$ddlReportFormat'] = ['10']
br.find_control('ctl00$MainContent$cbxSumGrand').items[0].selected = True
br.find_control('ctl00$MainContent$cbxSumKnown').items[0].selected = False
br.form['ctl00$MainContent$rblOutputType'] = ['2']
br.form['ctl00$MainContent$tbStartDate'] = '01/01/1999'
br.form['ctl00$MainContent$ibtnStart'] = ''
br.form['ctl00$MainContent$tbEndDate'] = today
br.form['ctl00$MainContent$ibtnEnd'] = ''
br.form['ctl00$MainContent$rblColumnSelection'] = ['regular']
response = br.submit()
The response I am getting is essentially just the HTML code of the site with the form filled out as expected. However, I was expecting an Excel file (as I selected an OutputType value of 2).
I think I am missing something on the submission front. Could somebody shed some light on what I am missing?
You're close, but there's more you need to do after submitting. In this case, just add:
doc = response.read()
ofile = '<your path>'
with open(ofile, 'w') as f:
    f.write(doc)
I couldn't actually test this on your website at the moment, so I'm just assuming all your settings before this are correct. I only have Python 3 at work, and mechanize only works on 2.x. Regardless, this is generally the way you want to retrieve this sort of output.
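One more detail worth checking: since the expected download is an Excel (binary) file, it is safer to open the output file in binary mode so the payload isn't altered on write (same idea as above, just with 'wb'):

doc = response.read()
ofile = '<your path>'
with open(ofile, 'wb') as f:  # binary mode keeps an .xls payload intact
    f.write(doc)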
