This question already has answers here:
How do I split the definition of a long string over multiple lines?
(30 answers)
Closed 1 year ago.
This is my code in a function:
def tri():
    import requests, json, urllib.parse
    username = "usernam"
    password = "pass"
    r = requests.Session()
    hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'}
    res = r.get('https://www.instagram.com/', headers=hd)
    payload = {'username': username, 'enc_password': '#PWD_INSTAGRAM_BROWSER:0:1254625879:' + password, 'queryParams': '{}', 'optIntoOneTap': 'false'}
    headers_text = '''Host: www.instagram.com
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
    Accept: */*
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    X-CSRFToken: %s
    X-IG-WWW-Claim: 0
    Content-Type: application/x-www-form-urlencoded
    X-Requested-With: XMLHttpRequest
    Content-Length: %s
    Origin: https://www.instagram.com
    Referer: https://www.instagram.com/
    Cookie: ig_did=%s; csrftoken=%s; mid=%s
    TE: Trailers''' % (res.cookies['csrftoken'], str(len(urllib.parse.urlencode(payload))), res.cookies['ig_did'], res.cookies['csrftoken'], res.cookies['mid'])
    payload_headers = {i.split(': ')[0]: i.split(': ')[1] for i in headers_text.split('\n')}
    resp = r.post("https://www.instagram.com/accounts/login/ajax/", headers=payload_headers, data=payload)
    if json.loads(resp.text)["authenticated"] == True:
        print('[+] Login successfully!')
        #print(resp.text)
    else:
        print(json.loads(resp.text))
        #print(word)

tri()
I want to log in to Instagram via the Python requests library. My code works fine outside a function or loop, but as soon as I put it inside a function or loop like this, I get this error:
Traceback (most recent call last):
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 31, in <module>
start(fakepyfile,mainpyfile)
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 30, in start
exec(open(mainpyfile).read(), __main__.__dict__)
File "<string>", line 40, in <module>
File "<string>", line 32, in tri
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 590, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1281, in _send_request
self.putheader(hdr, value)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connection.py", line 219, in putheader
_HTTPConnection.putheader(self, header, *values)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1208, in putheader raise ValueError('Invalid header name %r' % (header,))
ValueError: Invalid header name b'\tUser-Agent'
I don't know what's going on. I want to put my code into a function or loop.
Also, I'm coding on Android.
Your headers_text string contains leading whitespace (a tab, according to your traceback) on every header line, because the continuation lines of the triple-quoted string are indented along with the rest of the function. Multiline string literals are taken verbatim: every newline, space, and tab between the quotes becomes part of the string.
Remove the indentation or construct the string another way:
headers_text = '''Host: www.instagram.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
Accept: */*
...
'''
# or...
headers_text = (
    "Host: www.instagram.com\n"
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0\n"
    "Accept: */*\n")
...
# or... (less readable)
headers_text = "Host: www.instagram.com\n" \
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0\n" \
    "Accept: */*\n" \
    ...
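A further option, not mentioned above (so treat this as an editorial sketch rather than part of the accepted fix): textwrap.dedent lets you keep the literal indented in the source and strip the common leading whitespace afterwards. It assumes every line is indented with the same characters (all spaces here):

import textwrap

# Keep the source readable; dedent() removes the indentation common to all lines.
headers_text = textwrap.dedent('''\
    Host: www.instagram.com
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
    Accept: */*
    ''')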
Related
I am having some trouble trying to scrape these two specific pages and don't really see where the problem is. If you have any ideas or advice, I am all ears!
Thanks in advance!
import scrapy

class SneakersSpider(scrapy.Spider):
    name = "sneakers"

    def start_requests(self):
        headers = {'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
        urls = [
            #"https://stockx.com/fr-fr/retro-jordans",
            "https://stockx.com/fr-fr/retro-jordans?page=2",
            "https://stockx.com/fr-fr/retro-jordans?page=3",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        page = response.url.split("=")[-1]
        filename = f'sneakers-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
Looking at the traceback always helps. You should see something like this in your spider's output:
Traceback (most recent call last):
File "c:\program files\python37\lib\site-packages\scrapy\core\engine.py", line 127, in _next_request
request = next(slot.start_requests)
File "D:\Users\Ivan\Documents\Python\a.py", line 15, in start_requests
yield scrapy.Request(url = url, callback =self.parse ,headers = headers)
File "c:\program files\python37\lib\site-packages\scrapy\http\request\__init__.py", line 39, in __init__
self.headers = Headers(headers or {}, encoding=encoding)
File "c:\program files\python37\lib\site-packages\scrapy\http\headers.py", line 12, in __init__
super(Headers, self).__init__(seq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 193, in __init__
self.update(seq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 229, in update
super(CaselessDict, self).update(iseq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 228, in <genexpr>
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
ValueError: too many values to unpack (expected 2)
As you can see, there is a problem in the code that handles request headers.
headers is a set in your code; it should be a dict instead.
This works without a problem:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
Another way to set a default user agent for all requests is using the USER_AGENT setting.
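For reference, a minimal sketch of that settings-based approach; the user agent string below is simply the one from the question:

# settings.py (applied to every request the spider makes)
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'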
I am trying to scrape information from this website but keep getting status code 403,
so I tried using a header, but got TypeError: request() got an unexpected keyword argument 'header'.
Code:
import requests
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
url = "https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143"
pageObj = requests.get(url, header = head)
print("Status code: " + str(pageObj.status_code)) # *for testing purpose*
Error:
Traceback (most recent call last):
File "F:/Python/PyCharm Community Edition 2019.2.3/Workshop/WEB_SCRAPING/test2.py", line 6, in <module>
pageObj = requests.get(url, header = head)
File "F:\Python\PyCharm Community Edition 2019.2.3\Workshop\WEB_SCRAPING\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "F:\Python\PyCharm Community Edition 2019.2.3\Workshop\WEB_SCRAPING\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
TypeError: request() got an unexpected keyword argument 'header'
The header values are from the Firefox dev tools.
What am I doing wrong?
The name of the argument is headers, not header. See the docs.
Use pageObj = requests.get(url, headers=head)
You need to pass the header dict via the headers keyword argument; params is for URL query parameters, not headers:
import requests

head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
url = "https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143"
pageObj = requests.get(url, headers=head)
I'm new to Python.
I've made a list of URLs and I want to make a urllib.request for each URL in the list. My list currently has 5 URLs, but I can only request one index at a time, urllib.Request(List[0]), and if I do urllib.Request(List[0:4]) I get an error:
Traceback (most recent call last):
File "c:/Users/Farzad/Desktop/Python/Webscraping/Responseheaderinfo.py", line 22, in <module>
response = urllib.urlopen(request)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 548, in _open
'unknown_open', req)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: ['http>
import urllib.request as urllib
import socket
import pyodbc
from datetime import datetime
import ssl
import OpenSSL

List = open("C:\\Users\\Farzad\\Desktop\\hosts.txt").read().splitlines()
length = len(List)

for i in range(length):
    print(List)
    request = urllib.Request(List[0])
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')
    response = urllib.urlopen(request)
    rdata = response.info()
    ipaddr = socket.gethostbyname(request.origin_req_host)
The code could be as follows. Note that urllib.Request() takes a single URL string, so you have to loop and build one request per URL rather than passing a slice like List[0:4]:
import urllib.request as urllib
import socket
import traceback

List = open("C:\\Users\\Farzad\\Desktop\\hosts.txt").read().splitlines()

for url in List:
    print(url)
    try:
        request = urllib.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')
        response = urllib.urlopen(request)
        rdata = response.info()
        ipaddr = socket.gethostbyname(request.origin_req_host)
    except Exception:
        # Log the failure and continue with the next URL
        print(traceback.format_exc())
Working on a little script to fetch info from websites. I'm having trouble with HTTP errors.
req = urllib.request.Request(lnk['href'],
    headers={'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})
page = urllib.request.urlopen(req)
When this tries to fetch, for example, http://www.guru99.com/node-js-tutorial.html, I get a long series of errors ending with 406 Not Acceptable:
Traceback (most recent call last):
File "get_links.py", line 45, in <module>
page = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 471, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 581, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 509, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 443, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 589, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 406: Not Acceptable
Googling around, I found that I should fix the headers (as I have done above), and there are lots of tutorials about how to fix them. Except that not much of it actually works.
Is there some set of headers that is unlikely to cause problems with most sites? Is there a Python module someone has created that already includes commonly working headers? Is there a good way to retry several times with different headers until you get a good response?
This seems like a problem everyone who does web scraping with Python deals with, and I haven't found a decent solution.
HTTP Error 406 Not acceptable
The HyperText Transfer Protocol (HTTP) 406 Not Acceptable client error
response code indicates that the server cannot produce a response
matching the list of acceptable values defined in the request's
proactive content negotiation headers, and that the server is
unwilling to supply a default representation.
So the issue is with the value of your User-Agent header: bare Mozilla/5.0 is not a complete user agent string. Here are links to lists of valid user agent strings:
deviceatlas.com
developer.chrome.com
developer.mozilla.org
So change your code to the following:
req = urllib.request.Request(lnk['href'],
    headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})
I know the answer is too late but hope this helps someone else.
The following set of headers seems to work for most of the sites I have tested. If anyone has other suggestions, please offer them. I'm also interested in good ways to try different headers when one set doesn't work.
req = urllib.request.Request(lnk['href'],
    headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
page = urllib.request.urlopen(req)
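On trying different headers when one set fails: here is a minimal sketch of one way to do it (my own illustration, not from any answer in this thread). It cycles through candidate header sets until one succeeds; the header values are just the ones quoted above:

import urllib.request
import urllib.error

# Candidate header sets, tried in order.
HEADER_SETS = [
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
]

def fetch_with_fallback(url):
    # Try each header set; re-raise the last HTTP error if none succeeds.
    last_err = None
    for headers in HEADER_SETS:
        req = urllib.request.Request(url, headers=headers)
        try:
            return urllib.request.urlopen(req)
        except urllib.error.HTTPError as e:
            last_err = e  # e.g. 406; fall through to the next header set
    raise last_err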
I tried your code and got the same error, as expected.
I also tried it with the User-Agent my Chrome browser provides, and this seems to work:
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36
I also ran a test without passing an explicit header, which returned HTTP 200 (success) as well. That uses the default header provided by the library, e.g.
python-requests/2.10.0
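If you want to confirm what default User-Agent your installed version sends, requests exposes it directly:

import requests

# Prints e.g. 'python-requests/2.10.0'; the version part depends on your install.
print(requests.utils.default_user_agent())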
Hope this helps
Requests documentation
I'm trying to login to Paychex's Time and Labor website with Python to keep track of my hours automatically.
What I did so far was log in to Paychex using Chrome, look at the various requests in the "Network" tab of the Developer Tools, and translate the requests into Python Requests format:
import requests
import json
import sys
USER = 'username'
PASSWORD = 'password'
IMAGE_URL = 'https://landing.paychex.com/ssologin/Login.aspx/GetSecurityImage'
LOGIN_URL = 'https://landing.paychex.com/ssologin/Login.aspx/ProcessLogin'
LOGINFCC_URL = 'https://landing.paychex.com/ssologin/login.fcc'
def main():
    session = requests.session()

    # Get security image
    data = {'enteredUsername': USER}
    session.headers.update({
        'Origin': 'https://landing.paychex.com',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*',
        'Referer': 'https://landing.paychex.com/ssologin/login.aspx',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
    })
    r = session.post(IMAGE_URL, data=json.dumps(data))

    data = {'eu': USER, 'ep': PASSWORD}
    session.headers.update({
        'Host': 'landing.paychex.com',
        'Connection': 'keep-alive',
        'Content-Length': '34',
        'Accept': 'application/json, text/javascript, */*',
        'Origin': ' https://landing.paychex.com',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
        'Content-Type': 'application/json; charset=UTF-8',
        'Referer': ' https://landing.paychex.com/ssologin/login.aspx',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
    })
    r = session.post(LOGIN_URL, data=json.dumps(data))

    session.headers.update({
        'Host': 'landing.paychex.com',
        'Connection': 'keep-alive',
        'Content-Length': '653',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Origin': 'https://landing.paychex.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://landing.paychex.com/ssologin/login.aspx',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
    })
    data = {
        '__LASTFOCUS': '',
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': '',
        '__EVENTVALIDATION': '',
        'SMENC': 'ISO-8859-1',
        'SMLOCALE': 'US-EN',
        'target': '/LandingRedirect.aspx',
        'USER': USER,
        'PASSWORD': PASSWORD,
    }
    r = session.post(LOGINFCC_URL, data=data)  # Returns "Welcome USER" page. Successful login.

    session.headers.update({
        'Host': 'timeandlabor.paychex.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'en-US,en;q=0.8',
    })

    # Should return the time and labor page, but instead it hangs here until the connection is reset by peer.
    r = session.get('https://timeandlabor.paychex.com/secure/EmployeeHome.aspx')
    print(r.content.decode('utf-8'))

if __name__ == '__main__':
    main()
I understand that many of the headers are redundant. The third request returns the "Welcome, Name!" page, so it's definitely logging in successfully. The problem arises when I try to request the time and labor page with a GET (near the bottom). It just hangs there until the connection is reset. What am I doing wrong?
Stack trace:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 319, in _make_request
httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 493, in urlopen
body=body, headers=headers)
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 321, in _make_request
httplib_response = conn.getresponse()
File "/usr/local/lib/python3.4/http/client.py", line 1172, in getresponse
response.begin()
File "/usr/local/lib/python3.4/http/client.py", line 351, in begin
version, status, reason = self._read_status()
File "/usr/local/lib/python3.4/http/client.py", line 313, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "/usr/local/lib/python3.4/socket.py", line 371, in readinto
return self._sock.recv_into(b)
File "/usr/local/lib/python3.4/ssl.py", line 746, in recv_into
return self.read(nbytes, buffer)
File "/usr/local/lib/python3.4/ssl.py", line 618, in read
v = self._sslobj.read(len, buffer)
ConnectionResetError: [Errno 104] Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 327, in send
timeout=timeout
File "/usr/local/lib/python3.4/site-packages/requests/packages/urllib3/connectionpool.py", line 543, in urlopen
raise MaxRetryError(self, url, e)
requests.packages.urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='timeandlabor.paychex.com', port=443): Max retries exceeded with url: /secure/EmployeeHome.aspx (Caused by <class 'ConnectionResetError'>: [Errno 104] Connection reset by peer)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "paychex.py", line 89, in <module>
main()
File "paychex.py", line 84, in main
r = session.get('https://timeandlabor.paychex.com/secure/EmployeeHome.aspx')
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 468, in get
return self.request('GET', url, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 456, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/sessions.py", line 559, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.4/site-packages/requests/adapters.py", line 375, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='timeandlabor.paychex.com', port=443): Max retries exceeded with url: /secure/EmployeeHome.aspx (Caused by <class 'ConnectionResetError'>: [Errno 104] Connection reset by peer)
Using Python 3.4.2.