I'm working on an OCR project with Scrapy, using the OCR API at https://ocr.space/ocrapi . I have some code that works successfully with requests:
file_string = ctypes.string_at(image_data_pointer, length.value)
payload_filename = 'my_hires_image.jpg'
# Post payload as multipart encoded image file with filename.
# requests.post(THE_URL, files={'file': (payload_filename, payload)})
payload = {'isOverlayRequired': overlay,
           'apikey': api_key,
           'language': language,
           }
r = requests.post('https://api.ocr.space/parse/image',
                  files={payload_filename: file_string},
                  data=payload,
                  )
return r.content.decode()
I'm now trying to turn this into a Scrapy POST request. I have:
payload_filename = 'my_hires_image.jpg'
# Post payload as multipart encoded image file with filename.
# requests.post(THE_URL, files={'file': (payload_filename, payload)})
body = {'file': file_string,
        'isOverlayRequired': True,
        'apikey': 'mykey',
        'language': 'eng',
        }
files = {payload_filename: file_string}
yield FormRequest(url='https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
Please note that file_string is a byte string. The code is giving me:
File "/\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "/\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "/\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "Emy_PROject/spiders\ocr_spider.py", line 148, in get_PDF
yield FormRequest(url='https://api.ocr.space/parse/image', headers=headers2, body=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
File "/\lib\site-packages\scrapy\http\request\form.py", line 27, in __init__
super(FormRequest, self).__init__(*args, **kwargs)
File "/\lib\site-packages\scrapy\http\request\__init__.py", line 26, in __init__
self._set_body(body)
File "/\lib\site-packages\scrapy\http\request\__init__.py", line 69, in _set_body
self._body = to_bytes(body, self.encoding)
File "/\lib\site-packages\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got dict
How can I get this working?
Edit:
body = {'files': file_string,
        'isOverlayRequired': True,
        'apikey': '*******',
        'language': 'eng',
        }
body = urllib.parse.urlencode(body)
x = FormRequest('https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
yields:
File "....\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "....\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "....\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "....\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "....\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "myproject\spiders\ocr_spider.py", line 151, in get_PDF
x = FormRequest('https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
File "....\scrapy\http\request\form.py", line 31, in __init__
querystr = _urlencode(items, self.encoding)
File "....\scrapy\http\request\form.py", line 66, in _urlencode
for k, vs in seq
File "....\scrapy\http\request\form.py", line 65, in <listcomp>
values = [(to_bytes(k, enc), to_bytes(v, enc))
ValueError: not enough values to unpack (expected 2, got 1)
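One approach that may work here: FormRequest only URL-encodes its formdata dict (which is why passing a dict as body, or a raw byte string as a form value, fails), so a file upload generally needs a multipart/form-data body built by hand and sent with a plain Request. The sketch below is untested; it reuses file_string, row, i and self.ocr_space from the snippets above, and the field names file, isOverlayRequired, apikey and language come from the question's own payload:
# Sketch only: build a multipart/form-data body manually inside the spider callback,
# then send it with a plain scrapy Request instead of FormRequest.
# Assumes file_string (bytes), row, i and self.ocr_space exist as in the question.
import uuid
from scrapy import Request

boundary = uuid.uuid4().hex
parts = []
for name, value in (('isOverlayRequired', 'true'),
                    ('apikey', 'mykey'),
                    ('language', 'eng')):
    parts.append(b'--' + boundary.encode() + b'\r\n'
                 b'Content-Disposition: form-data; name="' + name.encode() + b'"\r\n\r\n'
                 + value.encode() + b'\r\n')
parts.append(b'--' + boundary.encode() + b'\r\n'
             b'Content-Disposition: form-data; name="file"; filename="my_hires_image.jpg"\r\n'
             b'Content-Type: image/jpeg\r\n\r\n'
             + file_string + b'\r\n')
parts.append(b'--' + boundary.encode() + b'--\r\n')

yield Request(
    'https://api.ocr.space/parse/image',
    method='POST',
    headers={'Content-Type': 'multipart/form-data; boundary=' + boundary},
    body=b''.join(parts),
    callback=self.ocr_space,
    meta={'row': row, 'cookiejar': i},
    dont_filter=True,
)
The key point is that the body given to Request must already be bytes (or str); Scrapy will not serialise a dict or attach files for you.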
I have built a Scrapy project which worked fine. Then, in the process of making it into an .exe file, I apparently ruined something, because it now gives the following error when run from the IDE (PyCharm):
2023-02-02 20:41:14 [scrapy.core.scraper] ERROR: Spider error processing <GET https://ra.co/dj/Antigone/past-events> (referer: None)
Traceback (most recent call last):
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\axelz\Programmeren\RA_scrapy\rascraper\rascraper\spiders\spiderone.py", line 41, in parse
for post in response.css(''):
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\scrapy\http\response\text.py", line 141, in css
return self.selector.css(query)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\selector.py", line 456, in css
return self.xpath(self._css2xpath(query))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\selector.py", line 459, in _css2xpath
return self._csstranslator.css_to_xpath(query)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\parsel\csstranslator.py", line 104, in css_to_xpath
return super().css_to_xpath(css, prefix)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\xpath.py", line 224, in css_to_xpath
for selector in parse(css)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 543, in parse
return list(parse_selector_group(stream))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 558, in parse_selector_group
yield Selector(*parse_selector(stream))
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 567, in parse_selector
result, pseudo_element = parse_simple_selector(stream)
File "C:\Users\axelz\Programmeren\RA_scrapy\venv\lib\site-packages\cssselect\parser.py", line 702, in parse_simple_selector
raise SelectorSyntaxError("Expected selector, got %s" % (stream.peek(),))
cssselect.parser.SelectorSyntaxError: Expected selector, got <EOF at 0>
2023-02-02 20:41:14 [scrapy.core.engine] INFO: Closing spider (finished)
I have tried really hard, but have no idea what the actual problem is.
Can anyone point me in the right direction?
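Judging from the traceback, the failure comes from the response.css('') call on line 41 of spiderone.py: cssselect raises "Expected selector, got <EOF at 0>" whenever the selector string is empty. A minimal illustration of the shape the call needs (the selector below is only a placeholder, not the actual selector for that page):
# Hypothetical sketch: response.css() must receive a non-empty CSS selector.
# 'div.event-item' is a placeholder; the real selector depends on the page markup.
for post in response.css('div.event-item'):
    yield {'text': post.css('::text').get()}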
I am trying to get the last row of the plate column and append data to it, but it gives a corrupt-file error even though Scrapy itself is working properly.
I guess the error is due to the lines below, where I first use a pandas ExcelWriter object and then use a DataFrame to get the last row.
with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
    df_last = pd.DataFrame('output_res.xlsx')
    lastRow = df_last['plate'].iget(-1)
    df_output = pd.DataFrame(itemList)
    df_output.to_excel(writer, sheet_name='result', index=False, header=True, startrow=lastRow)
And the variable lastRow is unassigned, as I guess; that's why it does not pass a value to the to_excel method.
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd

class plateScraper(scrapy.Spider):
    name = 'scrapePlate'
    allowed_domains = ['dvlaregistrations.dvla.gov.uk']

    def start_requests(self):
        df = pd.read_excel('data.xlsx')
        columnA_values = df['PLATE']
        for row in columnA_values:
            global plate_num_xlsx
            plate_num_xlsx = row
            base_url = f"https://dvlaregistrations.dvla.gov.uk/search/results.html?search={plate_num_xlsx}&action=index&pricefrom=0&priceto=&prefixmatches=&currentmatches=&limitprefix=&limitcurrent=&limitauction=&searched=true&openoption=&language=en&prefix2=Search&super=&super_pricefrom=&super_priceto="
            url = base_url
            yield scrapy.Request(url)

    def parse(self, response):
        itemList = []
        for row in response.css('div.resultsstrip'):
            plate = row.css('a::text').get()
            price = row.css('p::text').get()
            if plate_num_xlsx == plate.replace(" ", "").strip():
                item = {"plate": plate.strip(), "price": price.strip()}
                itemList.append(item)
                yield item
            else:
                item = {"plate": plate.strip(), "price": "-"}
                itemList.append(item)
                yield item
        with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
            df_last = pd.DataFrame('output_res.xlsx')
            lastRow = df_last['plate'].iget(-1)
            df_output = pd.DataFrame(itemList)
            df_output.to_excel(writer, sheet_name='result', index=False, header=True, startrow=lastRow)

process = CrawlerProcess()
process.crawl(plateScraper)
process.start()
This gives an error:
Traceback (most recent call last):
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\defer.py", line 240, in iter_errback
yield next(it)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\utils\python.py", line 338, in __next__
return next(self.data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 336, in <genexpr>
return (self._set_referer(r, response) for r in result or ())
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 28, in <genexpr>
return (r for r in result or () if self._filter(r, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 32, in <genexpr>
return (r for r in result or () if self._filter(r, response, spider))
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\scrapy\core\spidermw.py", line 79, in process_sync
for r in iterable:
File "C:\pythonPro\w_crawl\SimonDarak\scrpy_00.py", line 33, in parse
with pd.ExcelWriter('output_res.xlsx', mode='a',if_sheet_exists='overlay') as writer:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel\_openpyxl.py", line 73, in __init__
self._book = load_workbook(self._handles.handle, **engine_kwargs)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 317, in load_workbook
reader.read()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 282, in read
self.read_worksheets()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\reader\excel.py", line 228, in read_worksheets
ws_parser.bind_all()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 448, in bind_all
self.bind_cells()
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 351, in bind_cells
for idx, row in self.parser.parse():
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\site-packages\openpyxl\worksheet\_reader.py", line 144, in parse
for _, element in it:
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\xml\etree\ElementTree.py", line 1255, in iterator
data = source.read(16 * 1024)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 925, in read
data = self._read1(n)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 1015, in _read1
self._update_crc(data)
File "C:\Users\Admin\AppData\Local\Programs\Python\Python310\lib\zipfile.py", line 943, in _update_crc
raise BadZipFile("Bad CRC-32 for file %r" % self.name)
zipfile.BadZipFile: Bad CRC-32 for file 'xl/worksheets/sheet1.xml'
Process finished with exit code -1
I take the list from the parse method and put it outside the class.
itemList = []

def parse(self, response):
    for row in response.css('div.resultsstrip'):
        plate = row.css('a::text').get()
        price = row.css('p::text').get()
        a = plate.replace(" ", "").strip()
        print(plate_num_xlsx, a, a == plate_num_xlsx)
        if plate_num_xlsx == plate.replace(" ", "").strip():
            item = {"plate": plate.strip(), "price": price.strip()}
            itemList.append(item)
            yield item
        else:
            item = {"plate": plate_num_xlsx, "price": "-"}
            itemList.append(item)
            yield item
    with pd.ExcelWriter('output_res.xlsx', mode='r+', if_sheet_exists='overlay') as writer:
        df_output = pd.DataFrame(itemList)
        df_output.to_excel(writer, sheet_name='result', index=False, header=True)
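For what it's worth, here is a minimal sketch of how the last-row lookup and the append could look. It assumes output_res.xlsx already exists with a 'result' sheet and a header row; note that pd.ExcelWriter has no 'r+' mode (append mode is 'a'), an existing workbook is read with pd.read_excel rather than the DataFrame constructor, and a Series has no iget method:
# Sketch only: append itemList below the existing rows of the 'result' sheet.
# Assumes output_res.xlsx exists and itemList is the list built in parse().
import pandas as pd

df_existing = pd.read_excel('output_res.xlsx', sheet_name='result')
start_row = len(df_existing) + 1          # +1 skips the existing header row

df_output = pd.DataFrame(itemList)
with pd.ExcelWriter('output_res.xlsx', mode='a', engine='openpyxl',
                    if_sheet_exists='overlay') as writer:
    df_output.to_excel(writer, sheet_name='result', index=False,
                       header=False, startrow=start_row)
The BadZipFile error itself usually just means the .xlsx on disk is no longer a valid workbook (for example after an earlier failed write), so recreating the file before re-running may also be necessary.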
Below is my code on a Raspberry Pi, in Python (Thonny IDE).
Kindly ignore the URL; it is not the real address.
Code
from firebase import firebase

firebase = firebase.FirebaseApplication('https://testing123123-iot.firebaseio.com', authentication=None)

data = {
    'Name': 'Hi',
    'Email': 'hihi.com',
    'Phone': 512232131
}

result = firebase.post('/testing123123-iot:/Customer', data)
print(result)
Error
Traceback (most recent call last):
File "/home/pi/Documents/PythonCode/TestingFirebase-1.py", line 17, in
result = firebase.post('/testing-iot:/Customer', data)
File "/usr/local/lib/python3.7/dist-packages/firebase/decorators.py", line 19, in wrapped
return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/firebase/firebase.py", line 329, in post
connection=connection)
File "/usr/local/lib/python3.7/dist-packages/firebase/decorators.py", line 19, in wrapped
return f(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/firebase/firebase.py", line 97, in make_post_request
timeout=timeout)
File "/usr/local/lib/python3.7/dist-packages/requests/sessions.py", line 340, in post
return self.request('POST', url, data=data, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/requests/sessions.py", line 279, in request
resp = self.send(prep, stream=stream, timeout=timeout, verify=verify, cert=cert, proxies=proxies)
File "/usr/local/lib/python3.7/dist-packages/requests/sessions.py", line 374, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/requests/adapters.py", line 174, in send
timeout=timeout
File "/usr/local/lib/python3.7/dist-packages/requests/packages/urllib3/connectionpool.py", line 417, in urlopen
conn = self._get_conn(timeout=pool_timeout)
File "/usr/local/lib/python3.7/dist-packages/requests/packages/urllib3/connectionpool.py", line 232, in _get_conn
return conn or self._new_conn()
File "/usr/local/lib/python3.7/dist-packages/requests/packages/urllib3/connectionpool.py", line 547, in _new_conn
strict=self.strict)
TypeError: init() got an unexpected keyword argument 'strict'
Use json.dumps:
import json

data = {
    'Name': 'Hi',
    'Email': 'hihi.com',
    'Phone': 512232131
}

sent = json.dumps(data)
result = firebase.post('/testing123123-iot:/Customer', sent)
print(result)
I do not quite get why I keep getting an IndexError that says out of range here. Is it because the list link is empty? What is the correct format for writing the link here?
class POSpider(CrawlSpider):
    name = 'po'
    start_urls = ['https://www.poets.org/poetsorg/poems']
    allowed_domains = ['poets.org/poetsorg/poems']

    def parse(self, response):
        items = []
        l = response.xpath('//*[@class="themes"]//a//@href').extract()
        theme_ids = []
        for item in l:
            theme_ids.append(item[855:1412])
        theme_urls = []
        for tid in theme_ids:
            theme_urls.append('https://www.poets.org/poetsorg/poems?field_occasion_tid=All&field_poem_themes_tid=' + tid)
        for link in theme_urls:
            request = scrapy.Request(link, callback=self.parse_layer2, dont_filter=True)
            yield request

    def parse_layer2(self, response):
        items = []
        p = response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
        poem_urls = []
        for item in p:
            poem_urls.append(item)
        for link in poem_urls:
            request = scrapy.Request(link, callback=self.parse_layer3, dont_filter=True)
            yield request

    def parse_layer3(self, response):
        items = []
        poems = response.xpath('//*[@id="poem-content"]/div[2]/div/div').extract()
        for poem in poems:
            item = PoetryItem()
            s = poem.xpath('*/p/text()').extract()
            t = strip_list(s)
            t = t.encode('ascii', 'replace').lower() + '\r\n'
            item['poem'] = t
            items.append(item)
        return items
And this is what I kept getting as a result:
Traceback (most recent call last):
File "//anaconda/lib/python2.7/site-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "//anaconda/lib/python2.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/chinouhane/Desktop/Org/Org/spiders/PO_spider.py", line 37, in parse_layer2
p=response.xpath('//*[@id="block-views-poems-poems-block-all"]/div/div//tbody//td[2]//@href')[-1].extract()
File "//anaconda/lib/python2.7/site-packages/parsel/selector.py", line 56, in __getitem__
o = super(SelectorList, self).__getitem__(pos)
IndexError: list index out of range
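As the traceback shows, the IndexError comes from the [-1] index in parse_layer2: indexing an empty SelectorList (the XPath matched nothing on that response) raises IndexError. A small sketch of a guard, keeping the question's XPath as-is:
# Sketch: check the XPath result before taking the last element,
# since [-1] on an empty result raises IndexError.
hrefs = response.xpath('//*[@id="block-views-poems-poems-block-all"]'
                       '/div/div//tbody//td[2]//@href').extract()
if hrefs:
    last_href = hrefs[-1]   # only reached when the selector matched something
else:
    self.logger.warning('no poem links found on %s', response.url)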
I'm working with Scrapy. I want to rotate proxies on a per-request basis and get a proxy from an API I have that returns a single proxy. My plan is to make a request to the API, get a proxy, then use it to set the proxy based on:
http://stackoverflow.com/questions/39430454/making-request-to-api-from-within-scrapy-function
I have the following:
class ContactSpider(Spider):
    name = "contact"

    def parse(self, response):
        ....
        PR = Request(
            'my_api',
            headers=self.headers,
            meta={'newrequest': Request(url_to_scrape, headers=self.headers),},
            callback=self.parse_PR
        )
        yield PR

    def parse_PR(self, response):
        newrequest = response.meta['newrequest']
        proxy_data = response.body
        newrequest.meta['proxy'] = 'http://'+proxy_data
        newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
        newrequest.replace(callback= self.form_output) #TESTING
        yield newrequest

    def form_output(self, response):
        open_in_browser(response)
but I'm getting:
Traceback (most recent call last):
File "C:\twisted\internet\defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "C:\scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "C:\scrapy\core\downloader\handlers\http11.py", line 60, in download_request
return agent.download_request(request)
File "C:\scrapy\core\downloader\handlers\http11.py", line 255, in download_request
agent = self._get_agent(request, timeout)
File "C:\scrapy\core\downloader\handlers\http11.py", line 235, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "C:\scrapy\core\downloader\webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "C:\scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
host = b(parsed.hostname)
File "C:\scrapy\core\downloader\webclient.py", line 17, in <lambda>
b = lambda s: to_bytes(s, encoding='ascii')
File "C:\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got NoneType
What am I doing wrong?
The stack trace suggests Scrapy has encountered a request object whose url is None, whereas it is expected to be a string.
These two lines in your code:
newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest.replace(callback= self.form_output) #TESTING
would not work as expected, since method Request.replace returns a new instance instead of modifying the original request in-place.
You would need something like this:
newrequest = newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest = newrequest.replace(callback= self.form_output) #TESTING
or simply:
newrequest = newrequest.replace(
url='http://ipinfo.io/ip',
callback=self.form_output
)