Python: execute a request with multiple URLs from a list

I'm new to Python.
I've made a list of URLs and I want to make a urllib.request for every URL in the list. My list currently has 5 URLs, but I can only request one index at a time, urllib.Request(List[0]), and if I do urllib.Request(List[0:4]) I get this error:
Traceback (most recent call last):
File "c:/Users/Farzad/Desktop/Python/Webscraping/Responseheaderinfo.py", line 22, in <module>
response = urllib.urlopen(request)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 548, in _open
'unknown_open', req)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: ['http>
import urllib.request as urllib
import socket
import pyodbc
from datetime import datetime
import ssl
import OpenSSL

List = open("C:\\Users\\Farzad\\Desktop\\hosts.txt").read().splitlines()
length = len(List)

for i in range(length):
    print(List)
    request = urllib.Request(List[0])
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')
    response = urllib.urlopen(request)
    rdata = response.info()
    ipaddr = socket.gethostbyname(request.origin_req_host)

The code could be as follows. Iterate over the URLs themselves rather than indexing, and catch per-URL failures so one bad host doesn't stop the whole run:
import urllib.request as urllib
import socket
import traceback

List = open("C:\\Users\\Farzad\\Desktop\\hosts.txt").read().splitlines()

for url in List:
    print(url)
    try:
        request = urllib.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')
        response = urllib.urlopen(request)
        rdata = response.info()
        ipaddr = socket.gethostbyname(request.origin_req_host)
    except Exception:
        # log the full traceback but keep iterating over the remaining URLs
        print(traceback.format_exc())
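If the host list grows, the same loop can also run the fetches concurrently. Below is a minimal sketch using only the standard library; the timeout, worker count, and hosts.txt path are illustrative choices, not part of the original answer:

import socket
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

UA = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')

def fetch(url):
    """Fetch one URL; return (url, resolved IP, headers) or (url, error)."""
    try:
        request = urllib.request.Request(url, headers={'User-Agent': UA})
        with urllib.request.urlopen(request, timeout=10) as response:
            ipaddr = socket.gethostbyname(urlparse(url).hostname)
            return url, ipaddr, response.info()
    except Exception as exc:
        return url, exc

urls = open("hosts.txt").read().splitlines()
with ThreadPoolExecutor(max_workers=5) as pool:
    for result in pool.map(fetch, urls):
        print(result)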

Related

Getting error after making code a function or loop [duplicate]

This question was closed as a duplicate of: How do I split the definition of a long string over multiple lines?
This is my code in a function:
def tri():
import requests, json, urllib.parse
username = "usernam"
password = "pass"
r = requests.Session()
hd={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'}
res = r.get('https://www.instagram.com/', headers=hd)
payload = {'username':username,'enc_password':'#PWD_INSTAGRAM_BROWSER:0:1254625879:'+password,'queryParams':'{}','optIntoOneTap':'false'}
headers_text = '''Host: www.instagram.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
Accept: */*
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
X-CSRFToken: %s
X-IG-WWW-Claim: 0
Content-Type: application/x-www-form-urlencoded
X-Requested-With: XMLHttpRequest
Content-Length: %s
Origin: https://www.instagram.com
Referer: https://www.instagram.com/
Cookie: ig_did=%s; csrftoken=%s; mid=%s
TE: Trailers'''%(res.cookies['csrftoken'],str(len(urllib.parse.urlencode(payload))),res.cookies['ig_did'],res.cookies['csrftoken'],res.cookies['mid'])
payload_headers = {i.split(': ')[0]:i.split(': ')[1] for i in headers_text.split('\n')}
resp = r.post("https://www.instagram.com/accounts/login/ajax/", headers=payload_headers,data=payload)
if json.loads(resp.text)["authenticated"] == True:
print('[+] Login successfully!')
#print(resp.text)
else:
print(json.loads(resp.text))
#print(word)
tri()
I want to log in to Instagram via the Python requests library. My code works fine outside a function or loop, but when I put it inside one, as above, I get this error:
Traceback (most recent call last):
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 31, in <module>
start(fakepyfile,mainpyfile)
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 30, in start
exec(open(mainpyfile).read(), __main__.__dict__)
File "<string>", line 40, in <module>
File "<string>", line 32, in tri
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 590, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1281, in _send_request
self.putheader(hdr, value)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connection.py", line 219, in putheader
_HTTPConnection.putheader(self, header, *values)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1208, in putheader raise ValueError('Invalid header name %r' % (header,))
ValueError: Invalid header name b'\tUser-Agent'
I don't know what's going on. I want to put my code into a function or loop.
Also, I'm coding on Android.
Your headers_text variable contains tab characters because it's indented. Multiline strings are literal and include all of the newlines and whitespace inside them.
Remove the indentation or construct your string another way.
headers_text = '''Host: www.instagram.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
Accept: */*
...
'''

# or...
headers_text = (
    "Host: www.instagram.com\n"
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0\n"
    "Accept: */*\n")
...

# or... (less readable)
headers_text = "Host: www.instagram.com\n" \
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0\n" \
    "Accept: */*\n" \
...
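As an aside (not part of the original answer), textwrap.dedent lets you keep the literal indented in the source code while stripping the whitespace common to every line; build_headers_text is just an illustrative name:

import textwrap

def build_headers_text():
    # the backslash after ''' drops the leading newline;
    # dedent() then removes the indentation shared by all lines
    return textwrap.dedent('''\
        Host: www.instagram.com
        Accept: */*
        Accept-Language: en-US,en;q=0.5''')

print(build_headers_text())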

Error while obtaining start requests with Scrapy

I am having some trouble trying to scrape these two specific pages and don't really see where the problem is. If you have any ideas or advice, I'm all ears!
Thanks in advance!
import scrapy

class SneakersSpider(scrapy.Spider):
    name = "sneakers"

    def start_requests(self):
        headers = {'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
        urls = [
            #"https://stockx.com/fr-fr/retro-jordans",
            "https://stockx.com/fr-fr/retro-jordans?page=2",
            "https://stockx.com/fr-fr/retro-jordans?page=3",
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=headers)

    def parse(self, response):
        page = response.url.split("=")[-1]
        filename = f'sneakers-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
Looking at the traceback always helps. You should see something like this in your spider's output:
Traceback (most recent call last):
File "c:\program files\python37\lib\site-packages\scrapy\core\engine.py", line 127, in _next_request
request = next(slot.start_requests)
File "D:\Users\Ivan\Documents\Python\a.py", line 15, in start_requests
yield scrapy.Request(url = url, callback =self.parse ,headers = headers)
File "c:\program files\python37\lib\site-packages\scrapy\http\request\__init__.py", line 39, in __init__
self.headers = Headers(headers or {}, encoding=encoding)
File "c:\program files\python37\lib\site-packages\scrapy\http\headers.py", line 12, in __init__
super(Headers, self).__init__(seq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 193, in __init__
self.update(seq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 229, in update
super(CaselessDict, self).update(iseq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 228, in <genexpr>
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
ValueError: too many values to unpack (expected 2)
As you can see, there is a problem in the code that handles request headers.
headers is a set in your code; it should be a dict instead.
This works without a problem:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
Another way to set a default user agent for all requests is using the USER_AGENT setting.
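For reference, that setting is a one-liner in the project's settings.py; the UA string below simply reuses the one from the question:

# settings.py
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'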

TypeError: request() got an unexpected keyword argument 'header' when I use a header; 403 error without one

I am trying to scrape information from this website but keep getting status code 403, so I tried using a header, but got TypeError: request() got an unexpected keyword argument 'header'.
Code:
import requests
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
url = "https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143"
pageObj = requests.get(url, header = head)
print("Status code: " + str(pageObj.status_code)) # *for testing purpose*
Error:
Traceback (most recent call last):
File "F:/Python/PyCharm Community Edition 2019.2.3/Workshop/WEB_SCRAPING/test2.py", line 6, in <module>
pageObj = requests.get(url, header = head)
File "F:\Python\PyCharm Community Edition 2019.2.3\Workshop\WEB_SCRAPING\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "F:\Python\PyCharm Community Edition 2019.2.3\Workshop\WEB_SCRAPING\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
TypeError: request() got an unexpected keyword argument 'header'
[Screenshot: request headers from the Firefox dev tools]
What am I doing wrong?
The name of the argument is headers, not header. See the docs.
Use pageObj = requests.get(url, headers=head)
You need to pass the dict via the headers keyword argument; params would put the values into the URL's query string instead:
import requests

head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
url = "https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143"
pageObj = requests.get(url, headers=head)
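To see the difference: params are encoded into the URL's query string, while headers travel as HTTP request headers. A quick sketch, using httpbin.org purely as an echo service for illustration:

import requests

# params end up in the URL: https://httpbin.org/get?q=weather
# headers are sent as HTTP request headers
resp = requests.get(
    'https://httpbin.org/get',
    params={'q': 'weather'},
    headers={'User-Agent': 'Mozilla/5.0'},
)
print(resp.url)                               # shows the ?q=weather query string
print(resp.json()['headers']['User-Agent'])   # httpbin echoes the header back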

Python Program with urllib Module

Folks,
The program below is for finding the IP address shown on the page http://whatismyipaddress.com/:
import urllib2
import re

response = urllib2.urlopen('http://whatismyipaddress.com/')
p = response.readlines()
for line in p:
    ip = re.findall(r'(\d+.\d+.\d+.\d+)', line)
    print ip
But I am not able to troubleshoot the issue, as it gives the error below:
Traceback (most recent call last):
File "Test.py", line 5, in <module>
response = urllib2.urlopen('http://whatismyipaddress.com/')
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 437, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 550, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 475, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 558, in http_error_default
raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
urllib2.HTTPError: HTTP Error 403: Forbidden
Does anyone have any idea what change is required to remove the error and get the required output?
The HTTP error code 403 tells you that the server does not want to respond to your request for some reason. In this case, I think it is the user agent of your query (the default one used by urllib2).
You can change the user agent:
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open('http://www.whatismyipaddress.com/')
Then your query will work.
But there is no guarantee that this will keep working. The site could decide to block automated queries.
Try this
>>> import urllib2
>>> import re
>>> site= 'http://whatismyipaddress.com/'
>>> hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
... 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
... 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
... 'Accept-Encoding': 'none',
... 'Accept-Language': 'en-US,en;q=0.8',
... 'Connection': 'keep-alive'}
>>> req = urllib2.Request(site, headers=hdr)
>>> response = urllib2.urlopen(req)
>>> p = response.readlines()
>>> for line in p:
...     ip = re.findall(r'(\d+.\d+.\d+.\d+)', line)
...     print ip
See also: urllib2 HTTPError: HTTP Error 403: Forbidden
You may try the requests package here instead of urllib2; it is much easier to use:
import requests

url = 'http://whereismyip.com'
header = {'User-Agent': 'curl/7.21.3'}
r = requests.get(url, headers=header)
You can use curl as the User-Agent.

Why am I unable to receive data from this website?

I am trying to eventually make a program that parses the HTML of a particular website, but I get a bad status line error for the website I'd like to use. This code has worked fine for every other website I've tried. Is this something they are doing intentionally, and is there nothing I can do?
My code:
from lxml import html
import requests
webpage = 'http://www.whosampled.com/search/?q=de+la+soul'
page = requests.get(webpage)
tree = html.fromstring(page.text)
The error message I receive:
Traceback (most recent call last):
File "/home/kyle/Documents/web.py", line 6, in <module>
page = requests.get(webpage)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 65, in get
return request('get', url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 415, in send
raise ConnectionError(err, request=request)
ConnectionError: ('Connection aborted.', BadStatusLine("''",))
Provide a User-Agent header and it will work for you:
webpage = 'http://www.whosampled.com/search/?q=de+la+soul'
page = requests.get(webpage,
                    headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
Proof:
>>> from lxml import html
>>> import requests
>>>
>>> webpage = 'http://www.whosampled.com/search/?q=de+la+soul'
>>> page = requests.get(webpage, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
>>> tree = html.fromstring(page.content)
>>> tree.findtext('.//title')
'Search Results for "de la soul" | WhoSampled'
FYI, it would also work if you switch to https:
>>> webpage = 'https://www.whosampled.com/search/?q=de+la+soul'
>>> page = requests.get(webpage)
>>> tree = html.fromstring(page.content)
>>> tree.findtext('.//title')
'Search Results for "de la soul" | WhoSampled'
