Python 3: the same URL fails with urllib.request but succeeds with requests? - python

I am puzzled.
My idea is to send a request to the URL, extract the hidden POST fields from the page, and then post them back to the page. When I used urllib.request in Python it failed, but when I used requests instead, it worked!
Please tell me why.
Here is the code; the commented-out lines are the urllib.request version.
import urllib.request
import http.cookiejar
import re
import requests
loginUrl='https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'
#Here is the urllib.request code
#cookies=http.cookiejar.MozillaCookieJar()
#handler=urllib.request.HTTPCookieProcessor(cookies)
#opener=urllib.request.build_opener(handler)
headers={
'Origin': 'http://passport.csdn.net',
'Referer':'http://passport.csdn.net/account/login?from=http%3A%2F%2Fmy.csdn.net%2Fmy%2Fmycsdn',
'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36 LBBROWSER'
}
#Here is the requests code
s = requests.Session()
data = s.get(loginUrl)
data = data.text
#request = urllib.request.Request(loginUrl)
#response = urllib.request.urlopen(request)
#data = response.read().decode('utf-8')
# I get the values of lt and execution from the web page
pattern_lt = re.compile('<input type="hidden" name="lt" value="(.*?)" />',re.S)
lt = re.findall(pattern_lt,data)
lt = lt[0]
pattern_exe = re.compile('<input type="hidden" name="execution" value="(.*?)" />',re.S)
exe = re.findall(pattern_exe,data)
exe = exe[0]
postDict = {
'username':'qinyufeng_hdq@163.com',
'password':'csdn690076598',
'lt':lt,
'execution':exe,
'_eventId':'submit'
}
r = s.post(loginUrl, data=postDict)
#postData = urllib.parse.urlencode(postDict).encode()
#request = urllib.request.Request(loginUrl, postData,headers)
#response = opener.open(request)
#data = response.read().decode('UTF-8')
print (r.text)
I'm not good at English, but I hope you get my idea. Thank you for reading my problem.
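For comparison, here is a minimal sketch (not a verified diagnosis) of how the urllib.request version could mirror the working requests flow. The key assumption is that every request has to go through the cookie-aware opener and carry the headers; in the commented-out code the GET goes through urllib.request.urlopen instead of the opener, so the session cookies may never reach the POST. Credentials below are placeholders:

# Hedged sketch: same flow as the requests version, but with urllib.request.
# Assumes the failure comes from not routing every request through the
# cookie-aware opener; this is an educated guess, not a confirmed diagnosis.
import re
import http.cookiejar
import urllib.request
import urllib.parse

loginUrl = 'https://passport.csdn.net/account/login?from=http://my.csdn.net/my/mycsdn'
headers = {
    'Origin': 'http://passport.csdn.net',
    'Referer': 'http://passport.csdn.net/account/login?from=http%3A%2F%2Fmy.csdn.net%2Fmy%2Fmycsdn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36',
}

cookies = http.cookiejar.MozillaCookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookies))

# GET through the opener so the session cookies are stored
request = urllib.request.Request(loginUrl, headers=headers)
data = opener.open(request).read().decode('utf-8')

lt = re.findall(r'<input type="hidden" name="lt" value="(.*?)" />', data)[0]
exe = re.findall(r'<input type="hidden" name="execution" value="(.*?)" />', data)[0]

postDict = {
    'username': 'your_username',   # placeholder
    'password': 'your_password',   # placeholder
    'lt': lt,
    'execution': exe,
    '_eventId': 'submit',
}
postData = urllib.parse.urlencode(postDict).encode()

# POST through the same opener so the cookies from the GET are sent back
request = urllib.request.Request(loginUrl, postData, headers)
print(opener.open(request).read().decode('utf-8'))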

Related

How to bypass AKAMAI bot detection for data scraping using requests_html, Beautiful Soup

I am scraping data from the Rakuten Japanese e-commerce website. I am using requests-html and Beautiful Soup.
The problem is that when I request from my local PC (127.0.0.1) it works fine, but when I request from my EC2 server I get the message Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> and no data or HTML page is found. On the other hand, when I use wget on the server to fetch the same page URL, I get the full page, yet my script does not work.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running local server:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But I don't get any response when running it on my server; it just shows
this message: Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea how this could be done, I would be grateful to hear it.
I've tried your code on an EC2 instance in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
make sure your EC2 has the right ports open
double-check the headers (I've modified yours a bit - see the code below)
check your query input; maybe some of it is malformed?
don't spray the Rakuten server with too many requests from one EC2; maybe they're already blocking you (see the throttling sketch after the code below)
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
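On the last point in the checklist above, here is a small, hedged sketch of how the searches could be spaced out so a single EC2 instance does not hammer the site. The function name main_rakuten_product_search comes from the code above; the delay values are arbitrary assumptions, not Rakuten-specific guidance:

import time
import random

def polite_search(queries, min_delay=2.0, max_delay=5.0):
    # Run several searches with a random pause between them so one host
    # does not fire requests back-to-back; the delay range is a guess.
    results = {}
    for q in queries:
        results[q] = main_rakuten_product_search(q)
        time.sleep(random.uniform(min_delay, max_delay))
    return results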

Webscraping NSE Option Chain data in Python

In this code I'm trying to fetch NSE option chain data via Python code.
Tool - Spyder4
Python - 3.7
The code is not throwing any error, and I don't know what I'm doing wrong.
PRINT 1 gives me proper output as JSON data, but PRINT 2 and PRINT 3 do not show any output.
Can someone please help me debug this code?
import requests
import json
import pandas as pd
import xlwings as xw
from df2gspread import df2gspread as d2g
import gspread
from oauth2client.service_account import ServiceAccountCredentials
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 75)
pd.set_option('display.max_row', 2500)
url = "https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY"
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36',
"accept-language": "en-US,en;q=0.9,hi;q=0.8","accept-encoding": "gzip, deflate, br"}
cookie_dict = {'bm_sv' : 'AA02590AB18B4FC4A036CC62F5230694~8py6nqGfKvu3P4aKZoNpf4HZOUYQJ4i6JMyPMX14ksLZYE+0HlglIA3S2AAa9JGJPvXrBHcJ7uS2ZMcMq3f+FZ/ttHuqFzuAmMf1ZnI9hFgpqB7USISOoa3NfzMufwVAd0U7MgeSxF7+GjuyOuApyOQcoHmyr53hB4JLSqd0U1s'}
session = requests.session()
for cookie in cookie_dict:
    session.cookies.set(cookie, cookie_dict[cookie])

expiry = '16-Jul-2020'

def fetch_oi():
    r = session.get(url, headers=headers).json()
    #print(r)  # PRINT 1 - THIS PRINT IS WORKING
    if expiry:
        ce_values = [data['CE'] for data in r['records']['data'] if "CE" in data and str(data['expiryDate'].lower() == str(expiry).lower())]
        pe_values = [data['PE'] for data in r['records']['data'] if "PE" in data and str(data['expiryDate'].lower() == str(expiry).lower())]
    else:
        ce_values = [data['CE'] for data in r['filtered']['data'] if "CE" in data]
        pe_values = [data['PE'] for data in r['filtered']['data'] if "PE" in data]
    print(ce_values)  # PRINT 2 NO OUTPUT NO ERROR
    ce_data = pd.DataFrame(ce_values)
    pe_data = pd.DataFrame(pe_values)
    ce_data = ce_data.sort_values(['strikePrice'])
    pe_data = pe_data.sort_values(['strikePrice'])
    print(ce_values)  # PRINT 3 NO OUTPUT NO ERROR

def main():
    fetch_oi()

if __name__ == '__main__':
    main()
Your str conversion was failing and the requests call had missing parameters. I have modified your code; the version below should work.
import requests
import json
import pandas as pd
new_url = 'https://www.nseindia.com/api/option-chain-indices?symbol=BANKNIFTY'
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(new_url,headers=headers)
dajs = json.loads(page.text)
def fetch_oi(expiry_dt):
    ce_values = [data['CE'] for data in dajs['records']['data'] if "CE" in data and data['expiryDate'] == expiry_dt]
    pe_values = [data['PE'] for data in dajs['records']['data'] if "PE" in data and data['expiryDate'] == expiry_dt]
    ce_dt = pd.DataFrame(ce_values).sort_values(['strikePrice'])
    pe_dt = pd.DataFrame(pe_values).sort_values(['strikePrice'])
    print(ce_dt[['strikePrice', 'lastPrice']])

def main():
    expiry_dt = '27-Aug-2020'
    fetch_oi(expiry_dt)

if __name__ == '__main__':
    main()
Now they have added 2 main cookies which determine whether you are an authentic user or not.
Cookie names: nsit, nseappid
I couldn't find out how these two cookies are set in the browser.
On the first visit to the NSE site these 2 cookies are set somehow, of course with some expiration. For each resource request, e.g. https://www.nseindia.com/api/option-chain-indices?COUNTER, these two cookies need to be included in the request in order to get data.
Maybe I am late with this answer, but the script below is working fine for me:
import requests
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; '
'x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
main_url = "https://www.nseindia.com/"
response = requests.get(main_url, headers=headers)
#print(response.status_code)
cookies = response.cookies
url = "https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY"
bank_nifty_oi_data = requests.get(url, headers=headers, cookies=cookies)
print(bank_nifty_oi_data.json())
Thanks,
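A variant of the same idea, assuming it is acceptable to let requests.Session carry the nsit/nseappid cookies automatically instead of passing them by hand (a convenience on my part, not a confirmed NSE requirement):

import requests

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}

with requests.Session() as s:
    s.headers.update(headers)
    # Warm-up request: the homepage sets the session cookies
    s.get("https://www.nseindia.com/")
    # The API call then reuses those cookies automatically
    data = s.get("https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY").json()
    print(len(data['records']['data']))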
You can call the URL repeatedly until you get the data:
url = 'https://www.nseindia.com/api/option-chain-indices?symbol=' + symbol
found = False
while not found:
    try:
        data = requests.get(url, headers=urlheader).content
        data2 = data.decode('utf-8')
        df = json.loads(data2)
        expiry_dt = df['records']['expiryDates'][0]
        found = True
    except:
        pass
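One hedged variation on the retry loop above: the bare except and unbounded while can spin forever if the site keeps refusing, so a sketch with a retry limit, timeout, and a short pause may be safer. The retry count, timeout, and delay below are arbitrary choices:

import time
import requests

def fetch_option_chain(symbol, urlheader, max_retries=5, delay=2.0):
    # Try a handful of times instead of looping forever on failures.
    url = 'https://www.nseindia.com/api/option-chain-indices?symbol=' + symbol
    for attempt in range(max_retries):
        try:
            resp = requests.get(url, headers=urlheader, timeout=10)
            resp.raise_for_status()
            return resp.json()
        except (requests.RequestException, ValueError):
            time.sleep(delay)
    raise RuntimeError('No data after {} attempts'.format(max_retries))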

Python 3 : HTTP Error 405: Method Not Allowed

I'm getting an 'HTTP Error 405: Method Not Allowed' error. My code is:
import urllib.request
import urllib.parse
try:
    url = 'https://www.google.com/search'
    values = {'q': 'python programming tutorials'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')  # data should be bytes
    headers = {}
    headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
    req = urllib.request.Request(url, data, headers=headers)
    resp = urllib.request.urlopen(req)
    print("HERE")
    respData = resp.read()
    saveFile = open('withHeaders.txt', 'w')
    saveFile.write(str(respData))
    saveFile.close()
except Exception as e:
    print(e)
I guess the error is in req = urllib.request.Request(url, data, headers=headers). What is the error? Is it syntactical? What should be changed in the code? And if I have made any conceptual mistake, please correct me.
EDIT
Concept:
def URLRequest(url, params, method="GET"):
    if method == "POST":
        return urllib2.Request(url, data=urllib.urlencode(params))
    else:
        return urllib2.Request(url + "?" + urllib.urlencode(params))
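For reference, a minimal Python 3 sketch of the same concept. Passing a data argument makes urllib.request issue a POST, and Google's /search endpoint apparently rejects POST, which is the usual explanation for the 405; the helper below keeps the two cases explicit (treat it as a sketch, not a definitive fix):

import urllib.parse
import urllib.request

def url_request(url, params, method="GET", headers=None):
    # GET: encode params into the query string; POST: send them as the body.
    encoded = urllib.parse.urlencode(params)
    if method == "POST":
        return urllib.request.Request(url, data=encoded.encode('utf-8'), headers=headers or {})
    return urllib.request.Request(url + "?" + encoded, headers=headers or {})

# Example: the GET variant avoids the 405 from /search
req = url_request('https://www.google.com/search',
                  {'q': 'python programming tutorials'},
                  headers={'User-Agent': 'Mozilla/5.0'})
# body = urllib.request.urlopen(req).read()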
You can use the Requests library instead. It's much cleaner than urllib:
import requests
q = 'Whatever you want to search'
url = 'https://www.google.com/search'
response = requests.get(url+'?'+'q='+q)
saveFile = open('response.txt', 'w')
saveFile.write(response.text)
saveFile.close()
Or if you want to stick to urllib, you can do this:
import urllib.request
url = 'https://www.google.com/search'
q = 'Search Query'
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
request = urllib.request.Request(url+'?'+'q='+q, headers=headers)
response = urllib.request.urlopen(request).read() # the text of the response is here
saveFile = open('withHeaders.txt', 'w')
saveFile.write(str(response))
saveFile.close()
Here is an example based on www.pythonforbeginners:
# Importing the module
import urllib.request
# your search text
text="hi google"
# Define the url
url = 'http://www.google.com/#q='+text
# Add your headers
headers = {'User-Agent' : 'Mozilla 5.10'}
# Create the Request.
request = urllib.request.Request(url, None, headers)
# Getting the response
response = urllib.request.urlopen(request)
# Print the headers
print (response.read())
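One caveat about the snippet above: everything after # in a URL is a fragment and is never sent to the server, so the search text does not actually reach Google that way. A hedged alternative is to build a proper query string (a sketch, using the same urllib.request approach):

import urllib.parse
import urllib.request

text = "hi google"
# Encode the search text into a real query string instead of a fragment
url = 'https://www.google.com/search?' + urllib.parse.urlencode({'q': text})
headers = {'User-Agent': 'Mozilla/5.0'}
request = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(request)
print(response.read())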

python3 login website tumblr.com

How can I log in to tumblr using requests in Python 3?
Here is my code, but it doesn't work and just goes back to the login page.
I used requests.post to post the login form data, and it failed.
import requests
from bs4 import BeautifulSoup
start_url = 'https://www.tumblr.com'
# set a session for request
s = requests.Session()
s.headers.update({'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0', 'accept-language': 'zh-CN,zh;'}
)
# get the form_key for login_in
r = s.get(start_url)
login_soup = BeautifulSoup(r.text, 'lxml')
hidden_div = login_soup.find('div', class_='form_row_hidden').find_all('input')
key_dict = {}
for input_tag in hidden_div:
    tmp_dict = input_tag.attrs
    key_dict.update({tmp_dict['name']: tmp_dict['value']})
user_data_dict = {'determine_email': '×××××××××',
'user[email]': '××××××××',
'user[password]': '××××××××',
'user[age]': '',
'tumblelog[name]': ''}
key_dict.update(user_data_dict)
# log in tumblr
r_login=s.post(start_url, headers=headers, data=key_dict)
home_soup=BeautifulSoup(r.text, 'lxml')
print(home_soup)
# the output is still the log-in page.
You're nearly there.
Firstly, you have to make a request to the tumblr login page (https://tumblr.com/login). (You did.)
Then, you have to parse the HTML page and get the form_key value. This value is needed to make the real login.
Finally, make a POST request with the payload:
{'user[email]': your_mail,
'user[password]': your_pass,
'form_key': form_key
}
Below is sample code in Python 2, but I'm not using BeautifulSoup (you asked to use requests only ;)
In [1]: import requests
In [2]: from lxml import html
In [3]: url = 'https://www.tumblr.com/login'
In [4]: ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
In [5]: headers = {'User-Agent': ua}
In [6]: s = requests.session()
In [7]: lg = s.post(url, headers=headers)
In [8]: lg_html = html.fromstring(str(lg.text))
In [9]: form_key = lg_html.xpath("//meta[@name='tumblr-form-key']/@content")[0]
In [10]: payload = {'user[email]': 'your_mail',
....: 'user[password]': 'your_pass',
....: 'form_key': form_key}
In [11]: # real login
In [12]: s.post(url, headers=headers, data=payload)
Out[12]: <Response [200]>
In [13]: print s.get('https://www.tumblr.com/svc/post/get_post_form_builder_data').text
{"meta":{"status":200,"msg":"OK"},"response":{"channels":[{"name":"your_name","tags":[]}],"limits":{"videoSecondsRemaining":300,"preuploadPhotoUsed":0,"preuploadAudioUsed":0,"inlineEmbedsPerPost":5}}}

Google-Forms response with Python?

I'm trying to write a Python script which makes it possible to submit responses to Google Forms like this one:
https://docs.google.com/forms/d/152CTd4VY9pRvLfeACOf6SmmtFAp1CL750Sx72Rh6HJ8/viewform
But how do I actually send the POST, and how can I find out what this POST should actually contain?
First, pip install requests.
You have to post the form data to a specific URL; you can use requests for that. The form_data dict keys correspond to the form's options; if you don't need some of them, just remove them from form_data.
import requests
url = 'https://docs.google.com/forms/d/152CTd4VY9pRvLfeACOf6SmmtFAp1CL750Sx72Rh6HJ8/formResponse'
form_data = {'entry.2020959411':'18+ sollte absolute Pflicht sein',
'entry.2020959411':'Alter sollte garkeine Rolle spielen',
'entry.2020959411':'17+ wäre für mich vertretbar',
'entry.2020959411':'16+ wäre für mich vertretbar',
'entry.2020959411':'15+ wäre für mich vertretbar',
'entry.2020959411':'Ausnahmen von der Regel - Dafür?',
'entry.2020959411':'Ausnahmen von der Regel - Dagegen?',
'entry.2020959411':'__other_option__',
'entry.2020959411.other_option_response':'test',
'draftResponse':[],
'pageHistory':0}
user_agent = {'Referer':'https://docs.google.com/forms/d/152CTd4VY9pRvLfeACOf6SmmtFAp1CL750Sx72Rh6HJ8/viewform','User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36"}
r = requests.post(url, data=form_data, headers=user_agent)
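One thing worth noting about the dict above (and an assumption on my part about what was intended): a Python dict cannot hold the same key more than once, so only the last 'entry.2020959411' value is actually sent. If you want to submit several choices for one checkbox question, requests will repeat a key when its value is a list, for example:

import requests

form_data = {
    # several selections for the same checkbox entry, sent as repeated parameters
    'entry.2020959411': ['18+ sollte absolute Pflicht sein', '__other_option__'],
    'entry.2020959411.other_option_response': 'test',
    'draftResponse': [],
    'pageHistory': 0,
}
url = 'https://docs.google.com/forms/d/152CTd4VY9pRvLfeACOf6SmmtFAp1CL750Sx72Rh6HJ8/formResponse'
r = requests.post(url, data=form_data)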
Based on the answer from @pigletfly, I wrote a little script for harvesting the field names (for a text-field-only form):
import urllib.request
from bs4 import BeautifulSoup
import requests, warnings
def get_questions(in_url):
    res = urllib.request.urlopen(in_url)
    soup = BeautifulSoup(res.read(), 'html.parser')
    get_names = lambda f: [v for k, v in f.attrs.items() if 'label' in k]
    get_name = lambda f: get_names(f)[0] if len(get_names(f)) > 0 else 'unknown'
    all_questions = soup.form.findChildren(attrs={'name': lambda x: x and x.startswith('entry.')})
    return {get_name(q): q['name'] for q in all_questions}

def submit_response(form_url, cur_questions, verbose=False, **answers):
    submit_url = form_url.replace('/viewform', '/formResponse')
    form_data = {'draftResponse': [],
                 'pageHistory': 0}
    for v in cur_questions.values():
        form_data[v] = ''
    for k, v in answers.items():
        if k in cur_questions:
            form_data[cur_questions[k]] = v
        else:
            warnings.warn('Unknown Question: {}'.format(k), RuntimeWarning)
    if verbose:
        print(form_data)
    user_agent = {'Referer': form_url,
                  'User-Agent': "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.52 Safari/537.36"}
    return requests.post(submit_url, data=form_data, headers=user_agent)
You can then use the get_questions function to get the fields which you can fill in
TEST_FORM_URL = "https://docs.google.com/forms/d/e/1FAIpQLSfBmvqCVeDA7IZP2_mw_HZ0OTgDk2a0JN4VlY5KScECWC-_yw/viewform"
anno_questions = get_questions(TEST_FORM_URL)
This returns the questions (fields) as a dict:
{'annotator': 'entry.756364489',
'task': 'entry.1368373366',
'item_id': 'entry.84713541',
'label': 'entry.2072511216',
'session': 'entry.2021127767',
'time': 'entry.1122475936'}
Then use submit_response with keyword arguments to submit:
submit_response(TEST_FORM_URL, anno_questions, annotator="TestUser", item_id = 0)
Here is my script which works:
import urllib
import urllib2
user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
header={'User-Agent' : user_agent}
url = "http://....Your google form"
# values from your form. You will need to include any hidden variables if you want to..
values= {
'entry.asdfsdfsdasd': 'asdfasdfsd',
'draftResponse':'[,,"-asdfasdasdf"]',
'pageHistory':'0',
'fbzx':'-asdfasdfsd'
}
data = urllib.urlencode(values)
req = urllib2.Request(url, data, header)
response = urllib2.urlopen(req)  # actually send the request; without this nothing is submitted
I would use urllib2 and urllib to send the post.
Do something like this:
import urllib2, urllib
import cookielib

cookieJar = cookielib.LWPCookieJar()
opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookieJar),  # create opener with cookie support
    urllib2.HTTPRedirectHandler(),
    urllib2.HTTPHandler(debuglevel=0))
# Add headers
opener.addheaders = [('User-agent', "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36")]
forms = {
    "formname": value,   # the form field's name and the selected value you want
    "formname2": value2,
}
data = urllib.urlencode(forms)                          # encode data
req = urllib2.Request('http://www.example.com', data)   # build request
res = opener.open(req)                                  # send request
html = res.read()                                       # read response
You should structure it a bit like that.
To get the form field names, you need to look at the page's source code and find the names of the fields you want to fill in and submit.
Hope this helps.
Good luck :)
