Read pdf and find next word after a key word - python

I am trying to run the following code in order to search for the next words of a text that come after "examination".
Input is a pdf that i try to convert to a text using tinka.
Somehow the code throws an error referred to tinka that i do not understand.
Does anyone know how to fix it or knows another way to implement my problem?
import re
from tika import parser
raw = parser.from_file('application0001.pdf')
print(raw['content'])
list_of_words = raw.split()
search="examination"
next_word = list_of_words[list_of_words.index(search) + 1]
print(next_word)
This is the error I get when running it and I do not get what it means.
2019-05-24 09:53:53,217 [MainThread ] [INFO ] Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /var/folders/xn/p33pzhs179n33z55z66lqcn00000gn/T/tika-server.jar.
Traceback (most recent call last):
File "/Users/Mauritius/anaconda3/lib/python3.6/site-packages/tika/tika.py", line 716, in getRemoteJar
urlretrieve(urlOrPath, destPath)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 570, in error
return self._call_chain(*args)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Unavailable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/Mauritius/Desktop/text_search.py", line 7, in <module>
raw = parser.from_file('application0001.pdf')
File "/Users/Mauritius/anaconda3/lib/python3.6/site-packages/tika/parser.py", line 36, in from_file
jsonOutput = parse1('all', filename, serverEndpoint, headers=headers, config_path=config_path)
File "/Users/Mauritius/anaconda3/lib/python3.6/site-packages/tika/tika.py", line 328, in parse1
headers, verbose, tikaServerJar, config_path=config_path, rawResponse=rawResponse)
File "/Users/Mauritius/anaconda3/lib/python3.6/site-packages/tika/tika.py", line 522, in callServer
serverEndpoint = checkTikaServer(scheme, serverHost, port, tikaServerJar, classpath, config_path)
File "/Users/Mauritius/anaconda3/lib/python3.6/site-packages/tika/tika.py", line 571, in checkTikaServer
getRemoteJar(tikaServerJar, jarPath)
File "/Users/Mauritius/anaconda3/lib/python3.6/site-packages/tika/tika.py", line 726, in getRemoteJar
urlretrieve(urlOrPath, destPath)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 570, in error
return self._call_chain(*args)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/Users/Mauritius/anaconda3/lib/python3.6/urllib/request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Unavailable
[Finished in 2.7s with exit code 1]
[shell_cmd: python -u "/Users/Mauritius/Desktop/text_search.py"]
[dir: /Users/Mauritius/Desktop]
[path: /Users/Mauritius/miniconda3/bin:/opt/local/bin:/opt/local/sbin:/Users/Mauritius/anaconda3/bin:/Library/Frameworks/Python.framework/Versions/3.5/bin://anaconda/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Library/TeX/texbin]

Related

Raise HTTP Error 406: Not Acceptable when import SeqVecEmbedder from bio_embeddings.embed

I am trying to process protein sequence by using bio_embeddings, but it raise "HTTP Error 406: Not Acceptable" when import SeqVecEmbedder from bio_embeddings.embed.
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\site-packages\bio_embeddings\embed\__init__.py", line 31, in __init__
get_model_file(path=f.name, model='seqvecv{}'.format(str(kwargs.get('seqvec_version', 1))), file=file)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\site-packages\bio_embeddings\utilities\remote_file_retriever.py", line 72, in get_model_file
request.urlretrieve(url, filename=path, reporthook=t.update_to)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 247, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 531, in open
response = meth(req, response)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 641, in http_response
'http', request, response, code, msg, hdrs)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 569, in error
return self._call_chain(*args)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 503, in _call_chain
result = func(*args)
File "C:\ProgramData\Miniconda3\envs\bio_e\lib\urllib\request.py", line 649, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 406: Not Acceptable
That is my code
from bio_embeddings.embed import SeqVecEmbedder
import torch
seq = 'MVTYDFGSDEMHD'
embedder = SeqVecEmbedder()
embedding = embedder.embed(seq)
I am trying to use Bio_embeddings to process protein sequence.

How to solve the 410:Gone error in python

I am trying to make a program that searches amazon using the amazon simple-product API but it is showing some HTTP 410:Gone error.
Here is the code.
from amazon.api import AmazonAPI
amazon = AmazonAPI('A********************A', 'X**************************m',
'1*******************0')
a=input(':')
results = amazon.search(Keywords = a, SearchIndex = "Books")
for item in results:
print (item.title, item.isbn, item.price_and_currency)
Now this is the error
Traceback (most recent call last):
File "C:\Users\susheel\Desktop\booksearch.py", line 12, in <module>
for item in results:
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\site-packages\amazon\api.py", line
544, in __iter__
for page in self.iterate_pages():
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\site-packages\amazon\api.py", line
561, in iterate_pages
yield self._query(ItemPage=self.current_page, **self.kwargs)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\site-packages\amazon\api.py", line
573, in _query
response = self.api.ItemSearch(ResponseGroup=ResponseGroup, **kwargs)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\site-packages\bottlenose\api.py",
line 273, in __call__
response = self._call_api(api_url,
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\site-packages\bottlenose\api.py",
line 235, in _call_api
return urllib2.urlopen(api_request, timeout=self.Timeout)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 222, in
urlopen
return opener.open(url, data, timeout)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 531, in
open
response = meth(req, response)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 640, in
http_response
response = self.parent.error(
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 569, in
error
return self._call_chain(*args)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 502, in
_call_chain
result = func(*args)
File "C:\Users\susheel\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 649, in
http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 410: Gone
Please help me out.

python urlopen returns error

I am trying to parse some data from 'https://datausa.io/profile/geo/jacksonville-fl/#intro', but I am not sure how to access it from python. My code is:
adress, headers = urllib.request.urlretrieve(' https://datausa.io/profile/geo/jacksonville-fl/#intro')
handle = open(adress)
and it returns the error:
Traceback (most recent call last):
File "C:/Users/Jared/AppData/Local/Programs/Python/Python36-32/capstone1.py", line 16, in <module>
adress, headers = urllib.request.urlretrieve(' https://datausa.io/profile/geo/jacksonville-fl/#intro')
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 248, in urlretrieve
with contextlib.closing(urlopen(url, data)) as fp:
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\Jared\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
Please explain what is wrong or tell me a better way to access the page. Also, does the ' .io ' suffix affecthow python handles it?
Thanks.
This worked for me:
import requests
url = "https://datausa.io/profile/geo/jacksonville-fl/#intro"
req = requests.request("GET",url)

urllib.error.HTTPError: HTTP Error 403: Forbidden when using untangle python 3.6

This is my first attempt with python, Im trying to use an external library for xml parsing for python 3.6.
I'm getting an error which doesn't seem to have anything to do with my code, and I can't figure out what the problem is from the error output
my code:
import untangle
x = untangle.parse(r"C:\file.xml")
error:
Traceback (most recent call last):
File "C:/Project/Main.py", line 2, in <module>
x = untangle.parse(r"C:\file.xml")
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\site-packages\untangle.py", line 177, in parse
parser.parse(filename)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\xml\sax\expatreader.py", line 111, in parse
xmlreader.IncrementalParser.parse(self, source)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\xml\sax\xmlreader.py", line 125, in parse
self.feed(buffer)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\xml\sax\expatreader.py", line 217, in feed
self._parser.Parse(data, isFinal)
File "..\Modules\pyexpat.c", line 668, in ExternalEntityRef
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\xml\sax\expatreader.py", line 413, in external_entity_ref
"")
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\xml\sax\saxutils.py", line 364, in prepare_input_source
f = urllib.request.urlopen(source.getSystemId())
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\user\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

urllib.error.HTTPError: HTTP Error 404: NOT FOUND

I'm trying to run a script to grab dada from an API and feed to a mongodb database. And I keep getting this error but if I test one sing url it works.
Any suggestions?
full script http://pastebin.com/Xap5vYYC
Traceback (most recent call last):
File "hmsParser.py", line 112, in <module>
smReply = urlopen(smUrl).read().decode("utf8")
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 162, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 471, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 581, in http_response
'http', request, response, code, msg, hdrs)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 509, in error
return self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 443, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/urllib/request.py", line 589, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: NOT FOUND

Categories

Resources