Scrapy Spider gives an error while processing - python

I have built a Scrapy project which worked fine. Then, in the process of turning it into an .exe file, I apparently ruined something, because it now gives the following error when run from the IDE (PyCharm):
2023-02-02 20:41:14 [scrapy.core.scraper] ERROR: Spider error processing <GET https://ra.co/dj/Antigone/past-events> (referer: None)
Traceback (most recent call last):
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\rascraper\rascraper\spiders\spiderone.py", line 41, in parse
for post in response.css(''):
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\http\response\text.py", line 141, in css
return self.selector.css(query)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\selector.py", line 456, in css
return self.xpath(self._css2xpath(query))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\selector.py", line 459, in _css2xpath
return self._csstranslator.css_to_xpath(query)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\csstranslator.py", line 104, in css_to_xpath
return super().css_to_xpath(css, prefix)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\xpath.py", line 224, in css_to_xpath
for selector in parse(css)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 543, in parse
return list(parse_selector_group(stream))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 558, in parse_selector_group
yield Selector(*parse_selector(stream))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 567, in parse_selector
result, pseudo_element = parse_simple_selector(stream)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 702, in parse_simple_selector
raise SelectorSyntaxError("Expected selector, got %s" % (stream.peek(),))
cssselect.parser.SelectorSyntaxError: Expected selector, got <EOF at 0>
2023-02-02 20:41:14 [scrapy.core.engine] INFO: Closing spider (finished)
I have tried really hard, but have no idea what the actual problem is.
Can anyone point me in the right direction?
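The traceback points at line 41 of spiderone.py, where response.css('') is called with an empty selector string; cssselect cannot parse an empty selector, which is exactly the SelectorSyntaxError: Expected selector, got <EOF at 0>. A minimal sketch of the kind of change that avoids it (the selector below is hypothetical; the real one depends on the page's markup):

def parse(self, response):
    # Hypothetical selector: replace 'li.event-item' with whatever actually
    # matches the event entries on the past-events page.
    for post in response.css('li.event-item'):
        yield {
            'title': post.css('a::text').get(),
        }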

Related

pandas: getting the last row index of an Excel file and appending data to it

I am trying to get the last row of the plate column and append data after it, but it gives a corrupt-file error even though Scrapy itself is working properly.
I guess the error is due to the lines below, where I first use a pandas ExcelWriter object and then a DataFrame to get the last row.
with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
    df_last = pd.DataFrame('output_res.xlsx')
    lastRow = df_last['plate'].iget(-1)
    df_output = pd.DataFrame(itemList)
    df_output.to_excel(writer, sheet_name='result', index=False, header=True, startrow=lastRow)
My guess is that the variable lastRow ends up unassigned, which is why no value reaches the to_excel method.
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

class plateScraper(scrapy.Spider):
    name = 'scrapePlate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']

    def start_requests(self):
        df = pd.read_excel('data.xlsx')
        columnA_values = df['PLATE']
        for row in columnA_values:
            global plate_num_xlsx
            plate_num_xlsx = row
            base_url = f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
            url = base_url
            yield scrapy.Request(url)

    def parse(self, response):
        itemList = []
        for row in response.css('div.resultsstrip'):
            plate = row.css('a::text').get()
            price = row.css('p::text').get()
            if plate_num_xlsx == plate.replace(" ", "").strip():
                item = {"plate": plate.strip(), "price": price.strip()}
                itemList.append(item)
                yield item
            else:
                item = {"plate": plate.strip(), "price": "-"}
                itemList.append(item)
                yield item
        with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
            df_last = pd.DataFrame('output_res.xlsx')
            lastRow = df_last['plate'].iget(-1)
            df_output = pd.DataFrame(itemList)
            df_output.to_excel(writer, sheet_name='result', index=False, header=True, startrow=lastRow)

process = CrawlerProcess()
process.crawl(plateScraper)
process.start()
This gives an error:
Traceback (most recent call last):
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\pythonPro\w_crawl\SimonDarak\scrpy_00.py", line 33, in parse
with pd.ExcelWriter('output_res.xlsx', mode='a',if_sheet_exists='overlay') as writer:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel\_openpyxl.py", line 73, in __init__
self._book = load_workbook(self._handles.handle, **engine_kwargs)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 317, in load_workbook
reader.read()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 282, in read
self.read_worksheets()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 228, in read_worksheets
ws_parser.bind_all()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 448, in bind_all
self.bind_cells()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 351, in bind_cells
for idx, row in self.parser.parse():
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 144, in parse
for _, element in it:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\xml\etree\ElementTree.py", line 1255, in iterator
data = source.read(16 * 1024)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 925, in read
data = self._read1(n)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1015, in _read1
self._update_crc(data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 943, in _update_crc
raise BadZipFile("Bad CRC-32 for file %r" % self.name)
zipfile.BadZipFile: Bad CRC-32 for file 'xl/worksheets/sheet1.xml'
Process finished with exit code -1
I also tried taking the list out of the parse method and putting it outside the class:
itemList = []

def parse(self, response):
    for row in response.css('div.resultsstrip'):
        plate = row.css('a::text').get()
        price = row.css('p::text').get()
        a = plate.replace(" ", "").strip()
        print(plate_num_xlsx, a, a == plate_num_xlsx)
        if plate_num_xlsx == plate.replace(" ", "").strip():
            item = {"plate": plate.strip(), "price": price.strip()}
            itemList.append(item)
            yield item
        else:
            item = {"plate": plate_num_xlsx, "price": "-"}
            itemList.append(item)
            yield item
    with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
        df_output = pd.DataFrame(itemList)
        df_output.to_excel(writer, sheet_name='result', index=False, header=True)
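For what it's worth, a few things stand out in the snippet and traceback: pd.ExcelWriter only accepts mode='w' or mode='a' (not 'r+'); pd.DataFrame('output_res.xlsx') does not read a file (that would be pd.read_excel); Series has no iget method in current pandas (iloc[-1] gets the last value); and re-opening the workbook inside every parse() call is a likely cause of the corrupted file that the BadZipFile error complains about. A minimal sketch of one possible approach, assuming a 'result' sheet already exists and the write happens once after the crawl:

import pandas as pd

def append_results(item_list, path='output_res.xlsx'):
    # Read the existing sheet once to find the next free row.
    existing = pd.read_excel(path, sheet_name='result')
    start_row = len(existing) + 1  # +1 skips the existing header row

    # mode='a' with if_sheet_exists='overlay' writes into the existing sheet.
    with pd.ExcelWriter(path, mode='a', engine='openpyxl',
                        if_sheet_exists='overlay') as writer:
        pd.DataFrame(item_list).to_excel(
            writer, sheet_name='result',
            index=False, header=False, startrow=start_row)

Calling something like this from the spider's closed() method (or after process.start() returns) keeps the workbook from being rewritten on every response.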

Sending raw transaction from web3py: TypeError: <lambda>() missing 4 required positional arguments: 'hash', 'r', 's', and 'v'

I am trying to send a raw transaction with web3.py using this code:
t = w3.eth.account.sign_transaction(test_contract.functions.edit("test").buildTransaction(
    {
        "nonce": w3.eth.get_transaction_count(w3.eth.default_account)
    }
), pkey)
w3.eth.send_raw_transaction(t)
But when Python reaches the last line, I get this error in the console:
Traceback (most recent call last):
File "***/main.py", line 64, in <module>
w3.eth.send_raw_transaction(t)
File "***/venv/lib/python3.9/site-packages/web3/module.py", line 53, in caller
(method_str, params), response_formatters = method.process_params(module, *args, **kwargs) # noqa: E501
File "***/venv/lib/python3.9/site-packages/web3/method.py", line 194, in process_params
_apply_request_formatters(params, self.request_formatters(method)))
File "***/venv/lib/python3.9/site-packages/eth_utils/functional.py", line 45, in inner
return callback(fn(*args, **kwargs))
File "***/venv/lib/python3.9/site-packages/web3/method.py", line 50, in _apply_request_formatters
formatted_params = pipe(params, request_formatters)
File "cytoolz/functoolz.pyx", line 667, in cytoolz.functoolz.pipe
File "cytoolz/functoolz.pyx", line 642, in cytoolz.functoolz.c_pipe
File "cytoolz/functoolz.pyx", line 254, in cytoolz.functoolz.curry.__call__
File "cytoolz/functoolz.pyx", line 250, in cytoolz.functoolz.curry.__call__
File "***/venv/lib/python3.9/site-packages/web3/_utils/abi.py", line 799, in map_abi_data
return pipe(data, *pipeline)
File "cytoolz/functoolz.pyx", line 667, in cytoolz.functoolz.pipe
File "cytoolz/functoolz.pyx", line 642, in cytoolz.functoolz.c_pipe
File "cytoolz/functoolz.pyx", line 254, in cytoolz.functoolz.curry.__call__
File "cytoolz/functoolz.pyx", line 250, in cytoolz.functoolz.curry.__call__
File "***/venv/lib/python3.9/site-packages/web3/_utils/abi.py", line 833, in data_tree_map
return recursive_map(map_to_typed_data, data_tree)
File "***/venv/lib/python3.9/site-packages/web3/_utils/decorators.py", line 30, in wrapped
wrapped_val = to_wrap(*args)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 89, in recursive_map
items_mapped = map_collection(recurse, data)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 76, in map_collection
return datatype(map(func, collection))
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 88, in recurse
return recursive_map(func, item)
File "***/venv/lib/python3.9/site-packages/web3/_utils/decorators.py", line 30, in wrapped
wrapped_val = to_wrap(*args)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 89, in recursive_map
items_mapped = map_collection(recurse, data)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 76, in map_collection
return datatype(map(func, collection))
File "***/venv/lib/python3.9/site-packages/web3/_utils/abi.py", line 855, in __new__
return super().__new__(cls, *iterable)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 88, in recurse
return recursive_map(func, item)
File "***/venv/lib/python3.9/site-packages/web3/_utils/decorators.py", line 30, in wrapped
wrapped_val = to_wrap(*args)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 89, in recursive_map
items_mapped = map_collection(recurse, data)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 76, in map_collection
return datatype(map(func, collection))
TypeError: <lambda>() missing 4 required positional arguments: 'hash', 'r', 's', and 'v'
I am using a custom Infura node, which is why I can't send the transaction via contract.functions.method.transact(). I don't know what to do with this error; I've spent a lot of time reading the docs and got nothing.
How can I fix this?
sign_transaction returns a SignedTransaction object, while send_raw_transaction accepts raw transaction bytes. So change your last line to:
w3.eth.send_raw_transaction(t.rawTransaction)
You also probably want to save the result to a variable to track the transaction later.
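For example, something along these lines (waiting for the receipt is optional but handy for checking the result):

tx_hash = w3.eth.send_raw_transaction(t.rawTransaction)
receipt = w3.eth.wait_for_transaction_receipt(tx_hash)
print(receipt.status)  # 1 means the transaction succeeded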
You need to sign the transaction before sending it, using your account that has an ETH balance. You can do this with the signing middleware:
>>> from web3 import Web3, EthereumTesterProvider
>>> w3 = Web3(EthereumTesterProvider())
>>> from web3.middleware import construct_sign_and_send_raw_middleware
>>> from eth_account import Account
>>> acct = Account.create('KEYSMASH FJAFJKLDSKF7JKFDJ 1530')
>>> w3.middleware_onion.add(construct_sign_and_send_raw_middleware(acct))
>>> w3.eth.default_account = acct.address
# Now you can send a tx from acct.address without having to build and sign each raw transaction
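With the middleware installed, a contract call like the one in the question could then, as a sketch, be sent directly (test_contract is the contract object from the question):

tx_hash = test_contract.functions.edit("test").transact({'from': acct.address})
receipt = w3.eth.wait_for_transaction_receipt(tx_hash)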

Pylint resulting in RuntimeError: generator raised StopIteration with latest package versions

Here are my package versions,
$ pylint --version
pylint 2.3.1
astroid 2.2.5
Python 3.7.4 (default, Aug 14 2019, 12:09:51)
[GCC 8.3.0]
When I run
pylint {package_name}
I get a RuntimeError as shown below:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/astroid/protocols.py", line 492, in _infer_context_manager
enter = next(inferred.igetattr("__enter__", context=context))
StopIteration
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/bin/pylint", line 10, in <module>
sys.exit(run_pylint())
File "/usr/local/lib/python3.7/site-packages/pylint/__init__.py", line 20, in run_pylint
Run(sys.argv[1:])
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 1628, in __init__
linter.check(args)
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 943, in check
self._do_check(files_or_modules)
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 1075, in _do_check
self.check_astroid_module(ast_node, walker, rawcheckers, tokencheckers)
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 1158, in check_astroid_module
walker.walk(ast_node)
File "/usr/local/lib/python3.7/site-packages/pylint/utils.py", line 1303, in walk
self.walk(child)
File "/usr/local/lib/python3.7/site-packages/pylint/utils.py", line 1300, in walk
cb(astroid)
File "/usr/local/lib/python3.7/site-packages/pylint/checkers/variables.py", line 1590, in visit_import
module = next(_infer_name_module(node, parts[0]))
File "/usr/local/lib/python3.7/site-packages/astroid/util.py", line 160, in limit_inference
yield from islice(iterator, size)
File "/usr/local/lib/python3.7/site-packages/astroid/context.py", line 113, in cache_generator
for result in generator:
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 131, in raise_if_nothing_inferred
yield next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 95, in wrapped
res = next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/inference.py", line 240, in infer_import
yield self.do_import_module(name)
File "/usr/local/lib/python3.7/site-packages/astroid/mixins.py", line 100, in do_import_module
modname, level=level, relative_only=level and level >= 1
File "/usr/local/lib/python3.7/site-packages/astroid/scoped_nodes.py", line 619, in import_module
return MANAGER.ast_from_module_name(absmodname)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 171, in ast_from_module_name
return self.ast_from_file(found_spec.location, modname, fallback=False)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 91, in ast_from_file
return AstroidBuilder(self).file_build(filepath, modname)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 136, in file_build
return self._post_build(module, encoding)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 153, in _post_build
self.add_from_names_to_locals(from_node)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 206, in add_from_names_to_locals
imported = node.do_import_module()
File "/usr/local/lib/python3.7/site-packages/astroid/mixins.py", line 100, in do_import_module
modname, level=level, relative_only=level and level >= 1
File "/usr/local/lib/python3.7/site-packages/astroid/scoped_nodes.py", line 619, in import_module
return MANAGER.ast_from_module_name(absmodname)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 171, in ast_from_module_name
return self.ast_from_file(found_spec.location, modname, fallback=False)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 91, in ast_from_file
return AstroidBuilder(self).file_build(filepath, modname)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 136, in file_build
return self._post_build(module, encoding)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 156, in _post_build
self.delayed_assattr(delayed)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 223, in delayed_assattr
for inferred in node.expr.infer():
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 141, in raise_if_nothing_inferred
yield from generator
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 95, in wrapped
res = next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/inference.py", line 279, in infer_attribute
for owner in self.expr.infer(context):
File "/usr/local/lib/python3.7/site-packages/astroid/util.py", line 160, in limit_inference
yield from islice(iterator, size)
File "/usr/local/lib/python3.7/site-packages/astroid/context.py", line 113, in cache_generator
for result in generator:
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 141, in raise_if_nothing_inferred
yield from generator
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 95, in wrapped
res = next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/bases.py", line 137, in _infer_stmts
for inferred in stmt.infer(context=context):
File "/usr/local/lib/python3.7/site-packages/astroid/util.py", line 160, in limit_inference
yield from islice(iterator, size)
File "/usr/local/lib/python3.7/site-packages/astroid/context.py", line 113, in cache_generator
for result in generator:
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 131, in raise_if_nothing_inferred
yield next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 92, in wrapped
generator = _func(node, context, **kwargs)
File "/usr/local/lib/python3.7/site-packages/astroid/inference.py", line 832, in infer_assign
stmts = list(self.assigned_stmts(context=context))
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 131, in raise_if_nothing_inferred
yield next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/protocols.py", line 537, in with_assigned_stmts
yield from _infer_context_manager(self, mgr, context)
RuntimeError: generator raised StopIteration
From some searching, it seems this error should have been fixed in Pylint 2.x; however, I still get it with the latest versions on Python 3.7. Any fixes?
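If this is the known PEP 479 behaviour (Python 3.7 converts a StopIteration that escapes a generator into the RuntimeError seen here, and the traceback shows astroid 2.2.5's protocols.py leaking exactly that), then upgrading both packages, e.g. pip install --upgrade pylint astroid, may be enough, since later astroid releases handle the StopIteration internally.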

inappropriate deploy Scrapy proxies

I got an error message while scraping a profile. I assume I am using my proxy wrong, but what is the main error here? Can you help?
2017-06-15 21:35:17 [scrapy.proxies] INFO: Removing failed proxy, 12 proxies left
2017-06-15 21:35:17 [scrapy.core.scraper] ERROR: Error downloading <https://www.linkedin.com/in/jiajie-jacky-fan-80920083/>
Traceback (most recent call last):
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/twisted/internet/defer.py", line 1299, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/twisted/python/failure.py", line 393, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request, spider=spider)))
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/utils/defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/http11.py", line 63, in download_request
return agent.download_request(request)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/http11.py", line 272, in download_request
agent = self._get_agent(request, timeout)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/http11.py", line 252, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/webclient.py", line 21, in _parsed_url_args
port = parsed.port
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urlparse.py", line 113, in port
port = int(port, 10)
ValueError: invalid literal for int() with base 10: '178.32.255.199'
The proxy address should include the scheme ('http://'), e.g.:
rq.meta['proxy'] = 'http://127.0.0.1:8123'
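In a spider this might look like the sketch below (the class name is made up and the proxy address is the placeholder from above, not your real proxy):

import scrapy

class ProfileSpider(scrapy.Spider):
    name = 'profile'
    start_urls = ['https://www.linkedin.com/in/jiajie-jacky-fan-80920083/']

    def start_requests(self):
        for url in self.start_urls:
            # The scheme ('http://') must be part of the proxy value.
            yield scrapy.Request(url, meta={'proxy': 'http://127.0.0.1:8123'})

    def parse(self, response):
        pass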

Can't use API with username and password in Scrapy

This curl request works:
https://<user>:<pass>#xecdapi.xe.com/v1/convert_from.json/?from=1000000&to=SGD&amount=AED,AUD,BDT&inverse=True
But this Scrapy request doesn't work.
yield scrapy.Request("https://<user>:<pass>#xecdapi.xe.com/v1/convert_from.json/?from=1000000&to=SGD&amount=AED,AUD,BDT&inverse=True")
It returns this error:
Traceback (most recent call last):
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\defer.py", line 1297, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\handlers\http11.py", line 61, in download_request
return agent.download_request(request)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\handlers\http11.py", line 286, in download_request
method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\web\client.py", line 1596, in request
endpoint = self._getEndpoint(parsedURI)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\web\client.py", line 1580, in _getEndpoint
return self._endpointFactory.endpointForURI(uri)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\web\client.py", line 1456, in endpointForURI
uri.port)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\contextfactory.py", line 59, in creatorForNetloc
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext())
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\_sslverify.py", line 1201, in __init__
self._hostnameBytes = _idnaBytes(hostname)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\_sslverify.py", line 87, in _idnaBytes
return idna.encode(text)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\idna\core.py", line 355, in encode
result.append(alabel(label))
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\idna\core.py", line 276, in alabel
check_label(label)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\idna\core.py", line 253, in check_label
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
InvalidCodepoint: Codepoint U+003A at position 28 of u'xxxxxxxxxxxxxxxxxxxxxxxxxxxx:xxxxxxxxxxxxxxxxxxxxxxxxxxx#xecdapi' not allowed
Scrapy does not support HTTP authentication via the URL; use HttpAuthMiddleware instead.
In settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 811,
}
In the spider:
from scrapy.spiders import CrawlSpider

class SomeIntranetSiteSpider(CrawlSpider):
    http_user = 'someuser'
    http_pass = 'somepass'
    name = 'intranet.example.com'
    # .. rest of the spider code omitted ...
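With the middleware enabled and http_user/http_pass set on the spider, the request from the question can then be made without embedding the credentials in the URL, e.g. inside that spider:

    def start_requests(self):
        # HttpAuthMiddleware adds the Basic auth header from http_user/http_pass.
        yield scrapy.Request(
            "https://xecdapi.xe.com/v1/convert_from.json/"
            "?from=1000000&to=SGD&amount=AED,AUD,BDT&inverse=True"
        )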
