I am having some trouble trying to scrape these two specific pages and don't really see where the problem is. If you have any ideas or advice, I'm all ears!
Thanks in advance!
import scrapy

class SneakersSpider(scrapy.Spider):
    name = "sneakers"

    def start_requests(self):
        headers = {'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
        urls = [
            #"https://stockx.com/fr-fr/retro-jordans",
            "https://stockx.com/fr-fr/retro-jordans?page=2",
            "https://stockx.com/fr-fr/retro-jordans?page=3",
        ]
        for url in urls:
            yield scrapy.Request(url = url, callback =self.parse ,headers = headers)

    def parse(self, response):
        page = response.url.split("=")[-1]
        filename = f'sneakers-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
Looking at the traceback always helps. You should see something like this in your spider's output:
Traceback (most recent call last):
File "c:\program files\python37\lib\site-packages\scrapy\core\engine.py", line 127, in _next_request
request = next(slot.start_requests)
File "D:\Users\Ivan\Documents\Python\a.py", line 15, in start_requests
yield scrapy.Request(url = url, callback =self.parse ,headers = headers)
File "c:\program files\python37\lib\site-packages\scrapy\http\request\__init__.py", line 39, in __init__
self.headers = Headers(headers or {}, encoding=encoding)
File "c:\program files\python37\lib\site-packages\scrapy\http\headers.py", line 12, in __init__
super(Headers, self).__init__(seq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 193, in __init__
self.update(seq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 229, in update
super(CaselessDict, self).update(iseq)
File "c:\program files\python37\lib\site-packages\scrapy\utils\datatypes.py", line 228, in <genexpr>
iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
ValueError: too many values to unpack (expected 2)
As you can see, there is a problem in the code that handles request headers.
headers is a set in your code; it should be a dict instead.
This works without a problem:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}
Another way to set a default user agent for all requests is using the USER_AGENT setting.
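For example, a minimal sketch of that approach (assuming a standard Scrapy project with a settings.py):
# settings.py -- this user agent is then applied to every request the spider makes,
# so start_requests() no longer needs to pass a headers argument at all
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36')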
This is my code in a function:
def tri():
    import requests, json, urllib.parse

    username = "usernam"
    password = "pass"
    r = requests.Session()
    hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'}
    res = r.get('https://www.instagram.com/', headers=hd)
    payload = {'username': username, 'enc_password': '#PWD_INSTAGRAM_BROWSER:0:1254625879:' + password, 'queryParams': '{}', 'optIntoOneTap': 'false'}
    headers_text = '''Host: www.instagram.com
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
    Accept: */*
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    X-CSRFToken: %s
    X-IG-WWW-Claim: 0
    Content-Type: application/x-www-form-urlencoded
    X-Requested-With: XMLHttpRequest
    Content-Length: %s
    Origin: https://www.instagram.com
    Referer: https://www.instagram.com/
    Cookie: ig_did=%s; csrftoken=%s; mid=%s
    TE: Trailers''' % (res.cookies['csrftoken'], str(len(urllib.parse.urlencode(payload))), res.cookies['ig_did'], res.cookies['csrftoken'], res.cookies['mid'])
    payload_headers = {i.split(': ')[0]: i.split(': ')[1] for i in headers_text.split('\n')}
    resp = r.post("https://www.instagram.com/accounts/login/ajax/", headers=payload_headers, data=payload)
    if json.loads(resp.text)["authenticated"] == True:
        print('[+] Login successfully!')
        #print(resp.text)
    else:
        print(json.loads(resp.text))
    #print(word)

tri()
I want to log in to Instagram via the Python requests library. My code works fine when it is not inside a function or loop, but as soon as I put it inside a function or loop like this, I get this error:
Traceback (most recent call last):
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 31, in <module>
start(fakepyfile,mainpyfile)
File "/data/user/0/ru.iiec.pydroid3/files/accomp_files/iiec_run/iiec_run.py", line 30, in start
exec(open(mainpyfile).read(), __main__.__dict__)
File "<string>", line 40, in <module>
File "<string>", line 32, in tri
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 590, in post
return self.request('POST', url, data=data, json=json, **kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connection.py", line 234, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1281, in _send_request
self.putheader(hdr, value)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/site-packages/urllib3/connection.py", line 219, in putheader
_HTTPConnection.putheader(self, header, *values)
File "/data/user/0/ru.iiec.pydroid3/files/aarch64-linux-android/lib/python3.8/http/client.py", line 1208, in putheader raise ValueError('Invalid header name %r' % (header,))
ValueError: Invalid header name b'\tUser-Agent'
I don't know what's going on.
I want to put my code into a function or loop.
Also, I'm coding on Android.
Your headers_text variable contains tab characters because it's indented. Triple-quoted strings are literal and include every newline and every character of indentation, so each header name after the first one starts with the whitespace used to indent its line.
Remove the indentation or construct your string another way:
headers_text = '''Host: www.instagram.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0
Accept: */*
...
'''
# or...
headers_text = (
    "Host: www.instagram.com\n"
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0\n"
    "Accept: */*\n")
...
# or... (less readable)
headers_text = "Host: www.instagram.com\n" \
    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0\n" \
    "Accept: */*\n" \
    ...
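If the goal is just to end up with a dict of headers, a simpler option (only a sketch built from the values the question already uses, not a complete or required header set) is to skip the string parsing entirely and build the dict directly, so indentation can never leak into the header names:
payload_headers = {
    'Host': 'www.instagram.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0',
    'Accept': '*/*',
    'X-CSRFToken': res.cookies['csrftoken'],
    'Content-Type': 'application/x-www-form-urlencoded',
    'X-Requested-With': 'XMLHttpRequest',
    'Origin': 'https://www.instagram.com',
    'Referer': 'https://www.instagram.com/',
    # ...remaining entries from headers_text as needed
}
As a side note, requests computes Content-Length itself and the Session already carries the cookies it received, so those entries usually don't need to be set by hand.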
I am trying to scrape information from this website but keep getting status code 403.
So I tried sending a header, but got TypeError: request() got an unexpected keyword argument 'header'.
Code:
import requests
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
url = "https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143"
pageObj = requests.get(url, header = head)
print("Status code: " + str(pageObj.status_code)) # *for testing purpose*
Error:
Traceback (most recent call last):
File "F:/Python/PyCharm Community Edition 2019.2.3/Workshop/WEB_SCRAPING/test2.py", line 6, in <module>
pageObj = requests.get(url, header = head)
File "F:\Python\PyCharm Community Edition 2019.2.3\Workshop\WEB_SCRAPING\venv\lib\site-packages\requests\api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "F:\Python\PyCharm Community Edition 2019.2.3\Workshop\WEB_SCRAPING\venv\lib\site-packages\requests\api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
TypeError: request() got an unexpected keyword argument 'header'
(Screenshot: headers copied from the Firefox dev tools.)
What am I doing wrong?
The name of the argument is headers, not header. See the docs.
Use pageObj = requests.get(url, headers=head)
Pass the headers through the headers keyword argument:
import requests

head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'}
url = "https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143"
pageObj = requests.get(url, headers=head)
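If you plan to make several requests to the same site, you can also set the headers once on a requests.Session (a minimal sketch reusing the same URL and user agent as above), so every request sends them automatically:
import requests

session = requests.Session()
# headers set here are merged into every request made through this session
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'})

pageObj = session.get("https://www.accuweather.com/en/bd/dhaka/28143/current-weather/28143")
print("Status code: " + str(pageObj.status_code))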
I'm new to Python.
I've made a list of URLs and I want to make a urllib request for every URL in the list. The list currently has 5 URLs, but I can only request one index at a time, e.g. urllib.Request(List[0]); if I try urllib.Request(List[0:4]) I get this error:
Traceback (most recent call last):
File "c:/Users/Farzad/Desktop/Python/Webscraping/Responseheaderinfo.py", line 22, in <module>
response = urllib.urlopen(request)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 525, in open
response = self._open(req, data)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 548, in _open
'unknown_open', req)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\Users\Farzad\AppData\Local\Programs\Python\Python37-32\lib\urllib\request.py", line 1387, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: ['http>
import urllib.request as urllib
import socket
import pyodbc
from datetime import datetime
import ssl
import OpenSSL
List = open("C:\\Users\\Farzad\\Desktop\\hosts.txt").read().splitlines()
length = len(List)
for i in range(length):
    print(List)
    request = urllib.Request(List[0])
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')
    response = urllib.urlopen(request)
    rdata = response.info()
    ipaddr = socket.gethostbyname(request.origin_req_host)
The code could be as follows:
import urllib.request as urllib
import socket
import pyodbc
from datetime import datetime
import ssl
import OpenSSL
import traceback

List = open("C:\\Users\\Farzad\\Desktop\\hosts.txt").read().splitlines()
length = len(List)

# iterate over the URL strings themselves instead of indexing the list
for url in List:
    print(url)
    try:
        request = urllib.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36')
        response = urllib.urlopen(request)
        rdata = response.info()
        ipaddr = socket.gethostbyname(request.origin_req_host)
    except Exception as e:
        # log the full traceback for the failing URL and continue with the next one
        print(traceback.format_exc())
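For context on the original traceback: urllib.Request expects a single URL string, so passing a slice such as List[0:4] hands it a whole list, which gets converted to its string form, and urllib then treats ['http as the URL scheme, hence unknown url type: ['http. A tiny illustration with hypothetical URLs:
urls = ["http://example.com/a", "http://example.com/b"]

# Wrong: urls[0:2] is still a list, not a single URL string
# urllib.Request(urls[0:2])   # -> URLError: unknown url type: ['http

# Right: build one Request per URL string
for url in urls:
    request = urllib.Request(url)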
Recently I have been writing a web spider for fun. I want to learn how to log in to a website that uses a verification code, and one way I learnt is to use cookies, so I gave it a try. But I ran into a problem.
For example, I want to use requests.session to get the URL www.lovetvshow.com.
I can get all the HTML text, but when I try to convert it to JSON, it fails. It always shows "[ValueError] No JSON object could be decoded". But I already have the text, so why is it not a JSON object?
session = requests.session()
login_data = {'email': email, 'password': password}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Host': 'www.lovetvshow.com'
}
# r = session.post('http://www.renren.com/', data=login_data, headers=header)
r = session.get('http://www.lovetvshow.com/', headers=header)
print r
print r.json()
This will yield:
<Response [200]>
Traceback (most recent call last):
File "C:/Users/Hao/PycharmProjects/WebSpiderTutorial1/WebSpiderTutorial1.py", line 128, in <module>
requests_session, requests_cookies = create_session()
File "C:/Users/Hao/PycharmProjects/WebSpiderTutorial1/WebSpiderTutorial1.py", line 104, in create_session
print r.json()
File "C:\Python27\lib\site-packages\requests\models.py", line 892, in json
return complexjson.loads(self.text, **kwargs)
File "C:\Python27\lib\json\__init__.py", line 338, in loads
return _default_decoder.decode(s)
File "C:\Python27\lib\json\decoder.py", line 365, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Python27\lib\json\decoder.py", line 383, in raw_decode
raise ValueError("No JSON object could be decoded")
ValueError: No JSON object could be decoded
Any suggestions? Thanks in advance.
You have to make sure there is actually JSON to decode. You said yourself that the page returns HTML text, and HTML is not valid JSON, so r.json() has nothing to parse. Check the complete text with print r.text and look for your JSON there.
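A minimal guard (a sketch in the same Python 2 style as the question; the Content-Type check is just a common heuristic, the site is not guaranteed to send it):
r = session.get('http://www.lovetvshow.com/', headers=header)
if 'application/json' in r.headers.get('Content-Type', ''):
    data = r.json()
else:
    # the server sent HTML (or something else), so don't try to parse it as JSON
    print r.headers.get('Content-Type')
    print r.text[:200]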
I am eventually trying to make a program that parses the HTML of a particular website, but I get a bad status line error for the website I'd like to use. This code has worked fine for every other website I've tried. Is this something they are doing intentionally, and is there nothing I can do?
My code:
from lxml import html
import requests
webpage = 'http://www.whosampled.com/search/?q=de+la+soul'
page = requests.get(webpage)
tree = html.fromstring(page.text)
The error message I receive:
Traceback (most recent call last):
File "/home/kyle/Documents/web.py", line 6, in <module>
page = requests.get(webpage)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 65, in get
return request('get', url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 49, in request
response = session.request(method=method, url=url, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 461, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 573, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 415, in send
raise ConnectionError(err, request=request)
ConnectionError: ('Connection aborted.', BadStatusLine("''",))
Provide a User-Agent header and it would work for you:
webpage = 'http://www.whosampled.com/search/?q=de+la+soul'
page = requests.get(webpage,
                    headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
Proof:
>>> from lxml import html
>>> import requests
>>>
>>> webpage = 'http://www.whosampled.com/search/?q=de+la+soul'
>>> page = requests.get(webpage, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'})
>>> tree = html.fromstring(page.content)
>>> tree.findtext('.//title')
'Search Results for "de la soul" | WhoSampled'
FYI, it would also work if you switch to https:
>>> webpage = 'https://www.whosampled.com/search/?q=de+la+soul'
>>> page = requests.get(webpage)
>>> tree = html.fromstring(page.content)
>>> tree.findtext('.//title')
'Search Results for "de la soul" | WhoSampled'