I have built a Scrapy project which worked fine. Then, in the process of turning it into an .exe file, I apparently broke something, because it now gives the following error when run from the IDE (PyCharm):
2023-02-02 20:41:14 [scrapy.core.scraper] ERROR: Spider error processing <GET https://ra.co/dj/Antigone/past-events> (referer: None)
Traceback (most recent call last):
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\rascraper\rascraper\spiders\spiderone.py", line 41, in parse
for post in response.css(''):
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\http\response\text.py", line 141, in css
return self.selector.css(query)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\selector.py", line 456, in css
return self.xpath(self._css2xpath(query))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\selector.py", line 459, in _css2xpath
return self._csstranslator.css_to_xpath(query)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\csstranslator.py", line 104, in css_to_xpath
return super().css_to_xpath(css, prefix)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\xpath.py", line 224, in css_to_xpath
for selector in parse(css)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 543, in parse
return list(parse_selector_group(stream))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 558, in parse_selector_group
yield Selector(*parse_selector(stream))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 567, in parse_selector
result, pseudo_element = parse_simple_selector(stream)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 702, in parse_simple_selector
raise SelectorSyntaxError("Expected selector, got %s" % (stream.peek(),))
cssselect.parser.SelectorSyntaxError: Expected selector, got <EOF at 0>
2023-02-02 20:41:14 [scrapy.core.engine] INFO: Closing spider (finished)
I have tried really hard, but have no idea what the actual problem is.
Can anyone point me in the right direction?
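For the record, the traceback itself names the culprit: line 41 of spiderone.py calls response.css('') with an empty string, and cssselect cannot turn an empty selector into XPath, hence "Expected selector, got <EOF at 0>". Presumably the real selector string was lost while the project was being converted to an .exe. A minimal sketch of the shape the loop needs; 'div.post' is only a placeholder for whatever selector the page actually requires:

def parse(self, response):
    # response.css('') raises SelectorSyntaxError; the argument must be a
    # non-empty CSS selector. 'div.post' is a placeholder, not the real one.
    for post in response.css('div.post'):
        yield {'text': post.css('::text').get()}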
I am trying to get the last row of the column plate and append data after it, but it gives a corrupt-file error even though the Scrapy part works properly.
I guess the error is due to the lines below, where I first use a pandas ExcelWriter object and then a DataFrame to get the last row:
with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
    df_last = pd.DataFrame('output_res.xlsx')
    lastRow = df_last['plate'].iget(-1)
    df_output = pd.DataFrame(itemList)
    df_output.to_excel(writer, sheet_name='result', index=False, header=True, startrow=lastRow)
My guess is that the variable lastRow ends up unassigned, which is why no value reaches the to_excel method. Here is the full spider:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

class plateScraper(scrapy.Spider):
    name = 'scrapePlate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']

    def start_requests(self):
        df = pd.read_excel('data.xlsx')
        columnA_values = df['PLATE']
        for row in columnA_values:
            global plate_num_xlsx
            plate_num_xlsx = row
            base_url = f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
            url = base_url
            yield scrapy.Request(url)

    def parse(self, response):
        itemList = []
        for row in response.css('div.resultsstrip'):
            plate = row.css('a::text').get()
            price = row.css('p::text').get()
            if plate_num_xlsx == plate.replace(" ", "").strip():
                item = {"plate": plate.strip(), "price": price.strip()}
                itemList.append(item)
                yield item
            else:
                item = {"plate": plate.strip(), "price": "-"}
                itemList.append(item)
                yield item
        with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
            df_last = pd.DataFrame('output_res.xlsx')
            lastRow = df_last['plate'].iget(-1)
            df_output = pd.DataFrame(itemList)
            df_output.to_excel(writer, sheet_name='result', index=False, header=True, startrow=lastRow)

process = CrawlerProcess()
process.crawl(plateScraper)
process.start()
Running it gives this error:
Traceback (most recent call last):
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\pythonPro\w_crawl\SimonDarak\scrpy_00.py", line 33, in parse
with pd.ExcelWriter('output_res.xlsx', mode='a',if_sheet_exists='overlay') as writer:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel\_openpyxl.py", line 73, in __init__
self._book = load_workbook(self._handles.handle, **engine_kwargs)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 317, in load_workbook
reader.read()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 282, in read
self.read_worksheets()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 228, in read_worksheets
ws_parser.bind_all()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 448, in bind_all
self.bind_cells()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 351, in bind_cells
for idx, row in self.parser.parse():
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 144, in parse
for _, element in it:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\xml\etree\ElementTree.py", line 1255, in iterator
data = source.read(16 * 1024)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 925, in read
data = self._read1(n)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1015, in _read1
self._update_crc(data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 943, in _update_crc
raise BadZipFile("Bad CRC-32 for file %r" % self.name)
zipfile.BadZipFile: Bad CRC-32 for file 'xl/worksheets/sheet1.xml'
Process finished with exit code -1
I took the list out of the parse method and put it outside the class:
itemList = []

def parse(self, response):
    for row in response.css('div.resultsstrip'):
        plate = row.css('a::text').get()
        price = row.css('p::text').get()
        a = plate.replace(" ", "").strip()
        print(plate_num_xlsx, a, a == plate_num_xlsx)
        if plate_num_xlsx == plate.replace(" ", "").strip():
            item = {"plate": plate.strip(), "price": price.strip()}
            itemList.append(item)
            yield item
        else:
            item = {"plate": plate_num_xlsx, "price": "-"}
            itemList.append(item)
            yield item

with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
    df_output = pd.DataFrame(itemList)
    df_output.to_excel(writer, sheet_name='result', index=False, header=True)
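For comparison, here is a minimal sketch of the same idea that writes once, after the crawl has finished. It assumes pandas ≥ 1.4 with openpyxl installed; note that pd.DataFrame('output_res.xlsx') does not read a file (that is read_excel's job), ExcelWriter only accepts modes 'w' and 'a', and Series.iget() was removed from pandas long ago:

import pandas as pd
from scrapy.crawler import CrawlerProcess

itemList = []  # filled by parse() during the crawl

process = CrawlerProcess()
process.crawl(plateScraper)
process.start()  # blocks until the crawl is done

# Writing once, after the crawl, means openpyxl never sees a half-written
# workbook, which is what the BadZipFile error was complaining about.
df_output = pd.DataFrame(itemList)
try:
    df_last = pd.read_excel('output_res.xlsx', sheet_name='result')
    start = len(df_last) + 1  # first free row below the existing data
    with pd.ExcelWriter('output_res.xlsx', mode='a', if_sheet_exists='overlay') as writer:
        df_output.to_excel(writer, sheet_name='result', index=False,
                           header=False, startrow=start)
except FileNotFoundError:
    df_output.to_excel('output_res.xlsx', sheet_name='result', index=False)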
I am trying to send a raw transaction with web3.py using this code:
t = w3.eth.account.sign_transaction(
    test_contract.functions.edit("test").buildTransaction(
        {
            "nonce": w3.eth.get_transaction_count(w3.eth.default_account)
        }
    ),
    pkey,
)
w3.eth.send_raw_transaction(t)
But when Python reaches the last line, I get this error in the console:
Traceback (most recent call last):
File "***/main.py", line 64, in <module>
w3.eth.send_raw_transaction(t)
File "***/venv/lib/python3.9/site-packages/web3/module.py", line 53, in caller
(method_str, params), response_formatters = method.process_params(module, *args, **kwargs) # noqa: E501
File "***/venv/lib/python3.9/site-packages/web3/method.py", line 194, in process_params
_apply_request_formatters(params, self.request_formatters(method)))
File "***/venv/lib/python3.9/site-packages/eth_utils/functional.py", line 45, in inner
return callback(fn(*args, **kwargs))
File "***/venv/lib/python3.9/site-packages/web3/method.py", line 50, in _apply_request_formatters
formatted_params = pipe(params, request_formatters)
File "cytoolz/functoolz.pyx", line 667, in cytoolz.functoolz.pipe
File "cytoolz/functoolz.pyx", line 642, in cytoolz.functoolz.c_pipe
File "cytoolz/functoolz.pyx", line 254, in cytoolz.functoolz.curry.__call__
File "cytoolz/functoolz.pyx", line 250, in cytoolz.functoolz.curry.__call__
File "***/venv/lib/python3.9/site-packages/web3/_utils/abi.py", line 799, in map_abi_data
return pipe(data, *pipeline)
File "cytoolz/functoolz.pyx", line 667, in cytoolz.functoolz.pipe
File "cytoolz/functoolz.pyx", line 642, in cytoolz.functoolz.c_pipe
File "cytoolz/functoolz.pyx", line 254, in cytoolz.functoolz.curry.__call__
File "cytoolz/functoolz.pyx", line 250, in cytoolz.functoolz.curry.__call__
File "***/venv/lib/python3.9/site-packages/web3/_utils/abi.py", line 833, in data_tree_map
return recursive_map(map_to_typed_data, data_tree)
File "***/venv/lib/python3.9/site-packages/web3/_utils/decorators.py", line 30, in wrapped
wrapped_val = to_wrap(*args)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 89, in recursive_map
items_mapped = map_collection(recurse, data)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 76, in map_collection
return datatype(map(func, collection))
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 88, in recurse
return recursive_map(func, item)
File "***/venv/lib/python3.9/site-packages/web3/_utils/decorators.py", line 30, in wrapped
wrapped_val = to_wrap(*args)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 89, in recursive_map
items_mapped = map_collection(recurse, data)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 76, in map_collection
return datatype(map(func, collection))
File "***/venv/lib/python3.9/site-packages/web3/_utils/abi.py", line 855, in __new__
return super().__new__(cls, *iterable)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 88, in recurse
return recursive_map(func, item)
File "***/venv/lib/python3.9/site-packages/web3/_utils/decorators.py", line 30, in wrapped
wrapped_val = to_wrap(*args)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 89, in recursive_map
items_mapped = map_collection(recurse, data)
File "***/venv/lib/python3.9/site-packages/web3/_utils/formatters.py", line 76, in map_collection
return datatype(map(func, collection))
TypeError: <lambda>() missing 4 required positional arguments: 'hash', 'r', 's', and 'v'
I am using a custom Infura node, which is why I can't send the transaction via contract.functions.method.transact(). I don't know what to do with this error; I have spent a lot of time reading the docs and got nothing.
How can I fix this?
sign_transaction returns a SignedTransaction object, while send_raw_transaction accepts raw transaction bytes, so change your last line to:
w3.eth.send_raw_transaction(t.rawTransaction)
You also probably want to save the result to a variable to track the transaction later.
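Put together, a minimal sketch using the names from the question (wait_for_transaction_receipt is optional, but it is the easiest way to track the transaction afterwards):

signed = w3.eth.account.sign_transaction(
    test_contract.functions.edit("test").buildTransaction(
        {"nonce": w3.eth.get_transaction_count(w3.eth.default_account)}
    ),
    pkey,
)
# send_raw_transaction wants the raw bytes, not the SignedTransaction object
tx_hash = w3.eth.send_raw_transaction(signed.rawTransaction)
receipt = w3.eth.wait_for_transaction_receipt(tx_hash)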
You need to sign the transaction before sending it, using an account that has an ETH balance. For that you can use the signing middleware:
>>> from web3 import Web3, EthereumTesterProvider
>>> w3 = Web3(EthereumTesterProvider())
>>> from web3.middleware import construct_sign_and_send_raw_middleware
>>> from eth_account import Account
>>> acct = Account.create('KEYSMASH FJAFJKLDSKF7JKFDJ 1530')
>>> w3.middleware_onion.add(construct_sign_and_send_raw_middleware(acct))
>>> w3.eth.default_account = acct.address
# Now you can send a tx from acct.address without having to build and sign each raw transaction
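From there, a plain send_transaction call is signed locally by the middleware before it goes out; a sketch with a placeholder recipient (it assumes acct.address holds enough ETH to cover gas):

>>> tx_hash = w3.eth.send_transaction({
...     "to": "0x0000000000000000000000000000000000000000",  # placeholder
...     "value": 0,
... })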
Here are my package versions:
$ pylint --version
pylint 2.3.1
astroid 2.2.5
Python 3.7.4 (default, Aug 14 2019, 12:09:51)
[GCC 8.3.0]
When I run
pylint {package_name}
I get a RuntimeError as shown below:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/site-packages/astroid/protocols.py", line 492, in _infer_context_manager
enter = next(inferred.igetattr("__enter__", context=context))
StopIteration
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/bin/pylint", line 10, in <module>
sys.exit(run_pylint())
File "/usr/local/lib/python3.7/site-packages/pylint/__init__.py", line 20, in run_pylint
Run(sys.argv[1:])
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 1628, in __init__
linter.check(args)
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 943, in check
self._do_check(files_or_modules)
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 1075, in _do_check
self.check_astroid_module(ast_node, walker, rawcheckers, tokencheckers)
File "/usr/local/lib/python3.7/site-packages/pylint/lint.py", line 1158, in check_astroid_module
walker.walk(ast_node)
File "/usr/local/lib/python3.7/site-packages/pylint/utils.py", line 1303, in walk
self.walk(child)
File "/usr/local/lib/python3.7/site-packages/pylint/utils.py", line 1300, in walk
cb(astroid)
File "/usr/local/lib/python3.7/site-packages/pylint/checkers/variables.py", line 1590, in visit_import
module = next(_infer_name_module(node, parts[0]))
File "/usr/local/lib/python3.7/site-packages/astroid/util.py", line 160, in limit_inference
yield from islice(iterator, size)
File "/usr/local/lib/python3.7/site-packages/astroid/context.py", line 113, in cache_generator
for result in generator:
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 131, in raise_if_nothing_inferred
yield next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 95, in wrapped
res = next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/inference.py", line 240, in infer_import
yield self.do_import_module(name)
File "/usr/local/lib/python3.7/site-packages/astroid/mixins.py", line 100, in do_import_module
modname, level=level, relative_only=level and level >= 1
File "/usr/local/lib/python3.7/site-packages/astroid/scoped_nodes.py", line 619, in import_module
return MANAGER.ast_from_module_name(absmodname)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 171, in ast_from_module_name
return self.ast_from_file(found_spec.location, modname, fallback=False)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 91, in ast_from_file
return AstroidBuilder(self).file_build(filepath, modname)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 136, in file_build
return self._post_build(module, encoding)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 153, in _post_build
self.add_from_names_to_locals(from_node)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 206, in add_from_names_to_locals
imported = node.do_import_module()
File "/usr/local/lib/python3.7/site-packages/astroid/mixins.py", line 100, in do_import_module
modname, level=level, relative_only=level and level >= 1
File "/usr/local/lib/python3.7/site-packages/astroid/scoped_nodes.py", line 619, in import_module
return MANAGER.ast_from_module_name(absmodname)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 171, in ast_from_module_name
return self.ast_from_file(found_spec.location, modname, fallback=False)
File "/usr/local/lib/python3.7/site-packages/astroid/manager.py", line 91, in ast_from_file
return AstroidBuilder(self).file_build(filepath, modname)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 136, in file_build
return self._post_build(module, encoding)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 156, in _post_build
self.delayed_assattr(delayed)
File "/usr/local/lib/python3.7/site-packages/astroid/builder.py", line 223, in delayed_assattr
for inferred in node.expr.infer():
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 141, in raise_if_nothing_inferred
yield from generator
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 95, in wrapped
res = next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/inference.py", line 279, in infer_attribute
for owner in self.expr.infer(context):
File "/usr/local/lib/python3.7/site-packages/astroid/util.py", line 160, in limit_inference
yield from islice(iterator, size)
File "/usr/local/lib/python3.7/site-packages/astroid/context.py", line 113, in cache_generator
for result in generator:
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 141, in raise_if_nothing_inferred
yield from generator
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 95, in wrapped
res = next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/bases.py", line 137, in _infer_stmts
for inferred in stmt.infer(context=context):
File "/usr/local/lib/python3.7/site-packages/astroid/util.py", line 160, in limit_inference
yield from islice(iterator, size)
File "/usr/local/lib/python3.7/site-packages/astroid/context.py", line 113, in cache_generator
for result in generator:
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 131, in raise_if_nothing_inferred
yield next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 92, in wrapped
generator = _func(node, context, **kwargs)
File "/usr/local/lib/python3.7/site-packages/astroid/inference.py", line 832, in infer_assign
stmts = list(self.assigned_stmts(context=context))
File "/usr/local/lib/python3.7/site-packages/astroid/decorators.py", line 131, in raise_if_nothing_inferred
yield next(generator)
File "/usr/local/lib/python3.7/site-packages/astroid/protocols.py", line 537, in with_assigned_stmts
yield from _infer_context_manager(self, mgr, context)
RuntimeError: generator raised StopIteration
From some searching, it seems this error should have been fixed in Pylint 2.x; however, I still get it with the latest versions on Python 3.7. Any fixes?
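For what it is worth, this crash ("generator raised StopIteration") is PEP 479 semantics biting astroid on Python 3.7+, and the fix landed on the astroid side rather than in Pylint itself, so upgrading both together is the usual cure. A hedged suggestion, since I cannot see your environment:

$ pip install --upgrade pylint astroid
$ pylint --version   # astroid should now report something newer than 2.2.5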
I get an error message when scraping a profile. I assume I am using my proxy wrong, but what is the main error here? Can you help?
2017-06-15 21:35:17 [scrapy.proxies] INFO: Removing failed proxy , 12 proxies left
2017-06-15 21:35:17 [scrapy.core.scraper] ERROR: Error downloading <GET https://www.linkedin.com/in/jiajie-jacky-fan-80920083/>
Traceback (most recent call last):
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/twisted/internet/defer.py", line 1299, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/twisted/python/failure.py", line 393, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/utils/defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/http11.py", line 63, in download_request
return agent.download_request(request)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/http11.py", line 272, in download_request
agent = self._get_agent(request, timeout)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/handlers/http11.py", line 252, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "/Users/jiajiefan/data_mining/lib/python2.7/site-packages/Scrapy-1.4.0-py2.7.egg/scrapy/core/downloader/webclient.py", line 21, in _parsed_url_args
port = parsed.port
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urlparse.py", line 113, in port
port = int(port, 10)
ValueError: invalid literal for int() with base 10: '178.32.255.199'
The proxy address must include the scheme ('http://', etc.):
rq.meta['proxy'] = 'http://127.0.0.1:8123'
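For example, in a spider (a sketch; the port 8080 is an assumption, use whatever your proxy actually listens on):

def start_requests(self):
    rq = scrapy.Request('https://www.linkedin.com/in/jiajie-jacky-fan-80920083/')
    # Without the 'http://' scheme the proxy string is mis-parsed and the IP
    # ends up where the port should be, hence the ValueError in the traceback.
    rq.meta['proxy'] = 'http://178.32.255.199:8080'
    yield rq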
This cURL request works:
https://<user>:<pass>@xecdapi.xe.com/v1/convert_from.json/?from=1000000&to=SGD&amount=AED,AUD,BDT&inverse=True
But this Scrapy request doesn't:
yield scrapy.Request("https://<user>:<pass>@xecdapi.xe.com/v1/convert_from.json/?from=1000000&to=SGD&amount=AED,AUD,BDT&inverse=True")
It returns this error:
Traceback (most recent call last):
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\defer.py", line 1297, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\handlers\http11.py", line 61, in download_request
return agent.download_request(request)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\handlers\http11.py", line 286, in download_request
method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\web\client.py", line 1596, in request
endpoint = self._getEndpoint(parsedURI)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\web\client.py", line 1580, in _getEndpoint
return self._endpointFactory.endpointForURI(uri)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\web\client.py", line 1456, in endpointForURI
uri.port)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\scrapy\core\downloader\contextfactory.py", line 59, in creatorForNetloc
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext())
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\_sslverify.py", line 1201, in __init__
self._hostnameBytes = _idnaBytes(hostname)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\twisted\internet\_sslverify.py", line 87, in _idnaBytes
return idna.encode(text)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\idna\core.py", line 355, in encode
result.append(alabel(label))
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\idna\core.py", line 276, in alabel
check_label(label)
File "d:\kerja\hit\python~1\<project_name>\<project_name>\lib\site-packages\idna\core.py", line 253, in check_label
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
InvalidCodepoint: Codepoint U+003A at position 28 of u'xxxxxxxxxxxxxxxxxxxxxxxxxxxx:xxxxxxxxxxxxxxxxxxxxxxxxxxx@xecdapi' not allowed
Scrapy does not support HTTP authentication via the URL; we have to use HttpAuthMiddleware instead.
In settings.py:
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 811,
}
In the spider:
from scrapy.spiders import CrawlSpider

class SomeIntranetSiteSpider(CrawlSpider):
    http_user = 'someuser'
    http_pass = 'somepass'
    name = 'intranet.example.com'
    # .. rest of the spider code omitted ...
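With the middleware enabled and the credentials set on the spider, the request itself carries a clean URL; a sketch reusing the API URL from the question:

def start_requests(self):
    # HttpAuthMiddleware injects the Authorization header built from
    # http_user/http_pass, so the URL no longer embeds credentials.
    yield scrapy.Request(
        "https://xecdapi.xe.com/v1/convert_from.json/"
        "?from=1000000&to=SGD&amount=AED,AUD,BDT&inverse=True"
    )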