I want to extract the drug class using the RxNorm API, starting from an NDC code. My Python code is:
#!/usr/bin/python
#pip install simplejson
import os
import sys
import requests
import simplejson as json
def connectionCheck():
    url = 'http://rxnav.nlm.nih.gov/REST/version'
    header = {'Accept': 'application/json'}
    getCheck = requests.get(url, headers=header)
    if getCheck.status_code != requests.codes.ok:
        response = "RXNorm server response error. Response code: %s" % getCheck.status_code
    else:
        response = "Connection check complete. RXNorm online. Response code: %s" % getCheck.status_code
    return response

def rxNorm(ndc):
    # ndc value coming from master.py
    # ndc = [array of ndc values]
    if ndc[0] is None:
        return {"rxcui": "", "rxtty": "", "rxstring": ""}
    else:
        # if the internet or the request throws an error, exit so the connection can be checked
        try:
            baseurl = 'http://rxnav.nlm.nih.gov/REST/'
            # Searching RXNorm API, Search by identifier to find RxNorm concepts
            # http://rxnav.nlm.nih.gov/REST/rxcui?idtype=NDC&id=0591-2234-10
            # Set url parameters for searching RXNorm by NDC
            ndcSearch = 'rxcui?idtype=NDC&id='
            # Search RXNorm API, Return all properties for a concept
            rxPropSearch = 'rxcui/'
            rxttySearch = '/property?propName=TTY'
            rxstringSearch = '/property?propName=RxNorm%20Name'
            # Request RXNorm API to return json
            header = {'Accept': 'application/json'}

            def getTTY(rxCUI):
                # Search RXNorm again using RXCUI to return RXTTY
                getTTY = requests.get(baseurl + rxPropSearch + rxCUI + rxttySearch, headers=header)
                ttyJSON = json.loads(getTTY.text, encoding="utf-8")
                return ttyJSON['propConceptGroup']['propConcept'][0]['propValue']

            def getSTRING(rxCUI):
                # Search RXNorm again using RXCUI to return RXSTRING
                getString = requests.get(baseurl + rxPropSearch + rxCUI + rxstringSearch, headers=header)
                stringJSON = json.loads(getString.text, encoding="utf-8")
                return stringJSON['propConceptGroup']['propConcept'][0]['propValue']

            # Search RXNorm using NDC code, return RXCUI id
            # ndc = [ndc1, ndc2, ... ]
            for item in ndc:
                getRXCUI = requests.get(baseurl + ndcSearch + item, headers=header)
                if getRXCUI.status_code != requests.codes.ok:
                    print("RXNorm server response error. Response code: %s" % getRXCUI.status_code)
                rxcuiJSON = json.loads(getRXCUI.text, encoding="utf-8")
                # Check if this value returns a RXCUI; if not, go to the next value
                try:
                    if rxcuiJSON['idGroup']['rxnormId']:
                        rxCUI = rxcuiJSON['idGroup']['rxnormId'][0]
                        rxTTY = getTTY(rxCUI)
                        rxSTRING = getSTRING(rxCUI)
                        return {"rxcui": rxCUI, "rxtty": rxTTY, "rxstring": rxSTRING}
                except (KeyError, IndexError, TypeError):
                    # if this was the last item, return null values
                    if item == ndc[-1]:
                        return {"rxcui": "", "rxtty": "", "rxstring": ""}
        except Exception:
            sys.exit("RXNorm connection")
Test using a toy NDC code:
dataTest=rxNorm(['69238131109'])
print(dataTest)
which gave me the following output:
{'rxcui': '483448', 'rxtty': 'SCD', 'rxstring': 'pregabalin 50 MG Oral Capsule'}
Now I would like to get the drug class from 'rxcui': '483448' using the RxClass API. However, I couldn't make sense of that API. How can I use 'rxcui': '483448' here to get the desired drug class? I appreciate your time. Thanks!
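From the RxNav documentation, the RxClass class/byRxcui endpoint looks like the relevant one. Below is a minimal sketch of what I think the call would look like; the relaSource filter and the response field names are assumptions on my part, not something I have confirmed:

import requests

def rxClass(rxcui):
    # RxClass: look up drug classes for an RxNorm concept via class/byRxcui
    url = 'https://rxnav.nlm.nih.gov/REST/rxclass/class/byRxcui.json'
    # relaSource (e.g. 'ATC') narrows the classification system; assumed here
    params = {'rxcui': rxcui, 'relaSource': 'ATC'}
    resp = requests.get(url, params=params, headers={'Accept': 'application/json'})
    resp.raise_for_status()
    data = resp.json()
    classes = []
    # field names below are taken from the RxClass docs and may need adjusting
    for info in data.get('rxclassDrugInfoList', {}).get('rxclassDrugInfo', []):
        concept = info['rxclassMinConceptItem']
        classes.append({'classId': concept['classId'],
                        'className': concept['className'],
                        'classType': concept['classType']})
    return classes

print(rxClass('483448'))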
Why doesn't the script below return a photo URL? I have tried modifying the code, but it has no effect.
import requests
import json
def get_wiki_main_image(title):
    url = 'https://pl.wikipedia.org/wiki/Zamek_Kr%C3%B3lewski_na_Wawelu'
    data = {
        'action': 'query',
        'format': 'json',
        'formatversion': 2,
        'prop': 'pageimages|pageterms',
        'piprop': 'original',
        'titles': title
    }
    response = requests.get(url, data)
    json_data = json.loads(response.text)
    return json_data['query']['pages'][0]['original']['source'] if len(json_data['query']['pages']) > 0 else 'Not found'
urllink = get_wiki_main_image('zamek królewski na wawelu')
print (urllink)
Thanks for the help.
By observation, we can see that the pictures on Wikipedia are served from the folder https://upload.wikimedia.org/wikipedia/commons/thumb. Without using additional libraries:
import requests
r = requests.get('https://pl.wikipedia.org/wiki/Zamek_Kr%C3%B3lewski_na_Wawelu')
gen = r.iter_lines() # create a byte string generator
for s in gen:
    # is there such a substring, with the folder we need, in this line?
    if s.find(b'https://upload.wikimedia.org/wikipedia/commons/thumb') == -1:
        continue
    else:
        ss = s.split(b'"')  # split the byte string to separate the url
        print(ss[3].decode('utf-8'))  # take the url and convert it to a string
Console output:
https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Royal_Castle%2C_Wawel_Hill%2C_4_Wawel%2C_Old_Town%2C_Krak%C3%B3w%2C_Poland.jpg/1200px-Royal_Castle%2C_Wawel_Hill%2C_4_Wawel%2C_Old_Town%2C_Krak%C3%B3w%2C_Poland.jpg
https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Royal_Castle%2C_Wawel_Hill%2C_4_Wawel%2C_Old_Town%2C_Krak%C3%B3w%2C_Poland.jpg/800px-Royal_Castle%2C_Wawel_Hill%2C_4_Wawel%2C_Old_Town%2C_Krak%C3%B3w%2C_Poland.jpg
https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Royal_Castle%2C_Wawel_Hill%2C_4_Wawel%2C_Old_Town%2C_Krak%C3%B3w%2C_Poland.jpg/640px-Royal_Castle%2C_Wawel_Hill%2C_4_Wawel%2C_Old_Town%2C_Krak%C3%B3w%2C_Poland.jpg
The page serves the same picture in three different sizes.
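As for why the original script returns nothing: it sends the query parameters to the article URL instead of the MediaWiki API endpoint. A sketch of the likely fix, assuming the standard /w/api.php endpoint and the formatversion=2 response shape (not tested against pl.wikipedia.org):

import requests

def get_wiki_main_image(title):
    # query the MediaWiki API endpoint, not the article page itself
    url = 'https://pl.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'formatversion': 2,
        'prop': 'pageimages|pageterms',
        'piprop': 'original',
        'titles': title
    }
    json_data = requests.get(url, params=params).json()
    pages = json_data['query']['pages']
    # 'original' is only present when the page has a lead image
    return pages[0]['original']['source'] if pages and 'original' in pages[0] else 'Not found'

print(get_wiki_main_image('Zamek Królewski na Wawelu'))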
I have been doing research for a very important personal project. I would like to create a Flask search application that lets me search for content across 100+ PDF files. I have found some information about an Elasticsearch library that works well with Flask.
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import json
from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd
# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
# create a new PDF object with FPDF
pdf = FPDF()
# use an iterator to create 10 pages
for page in range(10):
    pdf.add_page()
    pdf.set_font("Arial", size=14)
    pdf.cell(150, 12, txt="Object Rocket ROCKS!!", ln=1, align="C")
# output all of the data to a new PDF file
pdf.output("object_rocket.pdf")
'''
read_pdf = PyPDF2.PdfFileReader("object_rocket.pdf")
page = read_pdf.getPage(0)
page_mode = read_pdf.getPageMode()
page_text = page.extractText()
print (type(page_text))
'''
#with open(path, 'rb') as file:
# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)
# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# create a dictionary object for page data
all_pages = {}
# put meta data into a dict key
all_pages["meta"] = {}
# Use 'iteritems()' instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print(meta, value)
    all_pages["meta"][meta] = value
# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()
    # extract the page's text
    page_text = data.extractText()
    # put the text data into the dict
    all_pages[page] = page_text
# create a JSON string from the dictionary
json_data = json.dumps(all_pages)
#print ("\nJSON:", json_data)
# convert JSON string to bytes-like obj
bytes_string = bytes(json_data, 'utf-8')
#print ("\nbytes_string:", bytes_string)
# convert bytes to base64 encoded string
encoded_pdf = base64.b64encode(bytes_string)
encoded_pdf = str(encoded_pdf)
#print ("\nbase64:", encoded_pdf)
# put the PDF data into a dictionary body to pass to the API request
body_doc = {"data": encoded_pdf}
# call the index() method to index the data
result = elastic_client.index(index="pdf", doc_type="_doc", id="42", body=body_doc)
# print the returned results
#print ("\nindex result:", result['result'])
# make another Elasticsearch API request to get the indexed PDF
result = elastic_client.get(index="pdf", doc_type='_doc', id=42)
# print the data to terminal
result_data = result["_source"]["data"]
#print ("\nresult_data:", result_data, '-- type:', type(result_data))
# decode the base64 data (slice with [2:-1] to strip
# the leading b' and trailing ' from the stringified bytes)
decoded_pdf = base64.b64decode(result_data[2:-1]).decode("utf-8")
#print ("\ndecoded_pdf:", decoded_pdf)
# take decoded string and make into JSON object
json_dict = json.loads(decoded_pdf)
#print ("\njson_str:", json_dict, "\n\ntype:", type(json_dict))
result2 = elastic_client.index(index="pdftext", doc_type="_doc", id="42", body=json_dict)
# create new FPDF object
pdf = FPDF()
# build the new PDF from the Elasticsearch dictionary
# Use 'iteritems()` instead of 'items()' for Python 2
""" for page, value in json_data:
if page != "meta":
# create new page
pdf.add_page()
pdf.set_font("Arial", size=14)
# add content to page
output = value + " -- Page: " + str(int(page)+1)
pdf.cell(150, 12, txt=output, ln=1, align="C")
else:
# create the meta data for the new PDF
for meta, meta_val in json_dict["meta"].items():
if "title" in meta.lower():
pdf.set_title(meta_val)
elif "producer" in meta.lower() or "creator" in meta.lower():
pdf.set_creator(meta_val)
"""
# output the PDF object's data to a PDF file
#pdf.output("object_rocket_from_elaticsearch.pdf" )
@app.route('/', methods=['GET'])
def index():
    return jsonify(json_dict)

@app.route('/<id>', methods=['GET'])
def index_by_id(id):
    return jsonify(json_dict[id])
""" #app.route('/insert_data', methods=['PUT'])
def insert_data():
slug = request.form['slug']
title = request.form['title']
content = request.form['content']
body = {
'slug': slug,
'title': title,
'content': content,
'timestamp': datetime.now()
}
result = es.index(index='contents', doc_type='title', id=slug, body=body)
return jsonify(result) """
app.run(port=5003, debug=True)
I found a blog about how to save PDF files as a Base64 index in Elasticsearch. I have seen DocuSign's API do this for document templating. However, I don't understand how to JSONify the Base64 PDF in a way that is searchable for Elasticsearch.
curl "http://localhost:9200/pdftext/_doc/42"
curl -X POST "http://localhost:9200/pdf/_search?q=*"
I can retrieve the Base64 of a 700-page document, but I think what I need is to index and retrieve each page of the document.
Blogs I have studied that got me part of the way:
https://kb.objectrocket.com/elasticsearch/how-to-index-a-pdf-file-as-an-elasticsearch-index-267
https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-xvi-full-text-search
endgame:
https://towardsdatascience.com/create-a-full-search-engine-via-flask-elasticsearch-javascript-d3js-and-bootstrap-275f9dc6efe1
I will continue to study Elasticsearch and Base64 encoding and decoding, but I would like some help getting to my goal. Any detailed example would be much appreciated.
------Progress------
I now have a working solution with no front-end search capability:
# Load_single_PDF_BY_PAGE_TO_index.py
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
from flask import Flask, jsonify, request, render_template, json
from datetime import datetime
import pandas as pd
# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
#with open(path, 'rb') as file:
# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)
# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()
# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)
# create a dictionary object for page data
all_pages = {}
# put meta data into a dict key
all_pages["meta"] = {}
# Use 'iteritems()' instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print(meta, value)
    all_pages["meta"][meta] = value
x = 44
# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()
    # extract the page's text
    page_text = data.extractText()
    # put the text data into the dict
    all_pages[page] = page_text
    body_doc2 = {"data": page_text}
    result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
    x += 1
The above code loads a single PDF into Elasticsearch page by page.
from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)
@app.route('/pdf', methods=['GET'])
def index():
    results = es.get(index='pdfclearn', doc_type='_doc', id='44')
    return jsonify(results['_source'])

@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
    results = es.get(index='pdfclearn', doc_type='_doc', id=id)
    return jsonify(results['_source'])
@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }
    res = es.search(index="pdfclearn", doc_type="_doc", body=body)
    return jsonify(res['hits']['hits'])

@app.route("/searhbar")
def searhbar():
    return render_template("index.html")

@app.route("/searhbar/<string:box>")
def process(box):
    query = request.args.get('query')
    if box == 'names':
        keyword = box
        body = {
            "query": {
                "multi_match": {
                    "query": keyword,
                    "fields": ["data"]
                }
            }
        }
        res = es.search(index="pdfclearn", doc_type="_doc", body=body)
        return jsonify(res['hits']['hits'])
app.run(port=5003, debug=True)
With the above code we can search across all pages for a keyword or phrase.
curl http://127.0.0.1:5003/search/test //it works!!
So I found a library called scout and... got it to work!
from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
import requests
from flask import Flask, jsonify, request, render_template, json

client = Scout('http://localhost:8000')

for k in range(7, 18):
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf" % (k))
    num = read_pdf.getNumPages()
    print("PDF pages:", num)
    all_pages = []
    for page in range(num):
        data = read_pdf.getPage(page)
        page_text = data.extractText()
        all_pages.append(page_text)
    # post each extracted page to the Scout server for indexing
    for z in all_pages:
        url = 'http://localhost:8000/documents/'
        data = {'content': z, 'indexes': ['test13']}
        headers = {
            'Content-Type': 'application/json',
        }
        response = requests.post(url, data=json.dumps(data), headers=headers)
        print(response)
I can now loop through as many PDFs as I want locally, post them to the server for indexing, and search for keywords. Now I just need help making a basic front end with a search bar that calls data from a JSON response in Python and Flask.
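A minimal sketch of what that front end could look like, assuming the /search/<keyword> route from the earlier Flask snippet is still running on port 5003; the inline template, route names, and port below are placeholders rather than a finished design:

from flask import Flask, request, render_template_string
import requests

app = Flask(__name__)

# hypothetical template with a search bar; in a real app this would live in templates/
SEARCH_PAGE = """
<form action="/results" method="get">
  <input type="text" name="q" placeholder="Search PDFs...">
  <button type="submit">Search</button>
</form>
{% if hits %}
  <ul>
  {% for hit in hits %}
    <li>{{ hit }}</li>
  {% endfor %}
  </ul>
{% endif %}
"""

@app.route("/")
def search_bar():
    return render_template_string(SEARCH_PAGE, hits=None)

@app.route("/results")
def results():
    q = request.args.get("q", "")
    # reuse the existing JSON search endpoint (assumed to be running on port 5003)
    resp = requests.get("http://127.0.0.1:5003/search/%s" % q)
    # show the first 200 characters of each matching page
    hits = [h["_source"]["data"][:200] for h in resp.json()]
    return render_template_string(SEARCH_PAGE, hits=hits)

if __name__ == "__main__":
    app.run(port=5004, debug=True)

The same routes could instead be added to the existing search app; running them separately here just keeps the sketch self-contained.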
So now Amazon has a solution for my use case: AWS Textract. If you create a free AWS account and install the CLI and the Python SDK, you can use the following code:
import boto3

# Document
documentName = "test2-28.png"

# Read document content
with open(documentName, 'rb') as document:
    imageBytes = document.read()

# Amazon Textract client
textract = boto3.client('textract')

# Call Amazon Textract
response = textract.detect_document_text(Document={'Bytes': imageBytes})

# print(response)

# Print detected text
for item in response["Blocks"]:
    if item["BlockType"] == "LINE":
        print('\033[94m' + item["Text"] + '\033[0m')
Make sure to convert your PDF pages to images first; the ML works off images. I used .png files for each page. Next I will need to loop through a folder with all the pages as images, and save the output to a CSV file or a database for future analysis.
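A minimal sketch of that loop, assuming a hypothetical pages/ folder containing one .png per page; the folder name, output file, and CSV layout are made up for illustration:

import csv
import os

import boto3

textract = boto3.client('textract')

# hypothetical folder containing one .png per PDF page
pages_dir = "pages"

with open("textract_output.csv", "w", newline="") as out:
    writer = csv.writer(out)
    writer.writerow(["file", "line_text"])
    for name in sorted(os.listdir(pages_dir)):
        if not name.lower().endswith(".png"):
            continue
        with open(os.path.join(pages_dir, name), "rb") as f:
            image_bytes = f.read()
        response = textract.detect_document_text(Document={'Bytes': image_bytes})
        # one CSV row per detected LINE block
        for block in response["Blocks"]:
            if block["BlockType"] == "LINE":
                writer.writerow([name, block["Text"]])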
Try this: https://www.elastic.co/guide/en/elasticsearch/reference/6.8/binary.html
Use store: true for this datatype, since by default it does not store the data or allow searching.
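For example, creating an index with a stored binary field through the Python client might look like the sketch below; the index and field names are made up, and on 6.x the properties block would sit under a mapping type rather than directly under "mappings":

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200/")

# sketch of a 7.x-style mapping with a stored binary field
mapping = {
    "mappings": {
        "properties": {
            "blob": {
                "type": "binary",
                "store": True
            }
        }
    }
}
es.indices.create(index="pdf_binary", body=mapping)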
I'm making some data visualizations from a movie database API. I can already access the data in the normal way, but when I load the JSON data and loop over it to print it, the output is just the top-level keys; I need to access the objects inside.
url = "https://api.themoviedb.org/3/discover/movie?api_key="+ api_key
+"&language=en- US&sort_by=popularity.desc&include_adult=
false&include_video=false&page=1" # api url
response = urllib.request.urlopen(url)
raw_json = response.read().decode("utf-8")
data = json.loads(raw_json)
for j in data:
print(j)
I expect the output to be
[{'popularity': 15,
'id': 611,
'video': False,
'vote_count': 1403,
'vote_average': 8.9,
'title': 'lalalalo'},{....}]
but the actual output is
page
total_results
total_pages
results
The results are one level down. You are looping through the metadata.
Try changing your code to
import json
import urllib.request

api_key = "your api code"

url = "https://api.themoviedb.org/3/discover/movie?api_key=" + api_key + "&language=en-US&sort_by=popularity.desc&include_adult=false&include_video=false&page=1"  # api url
response = urllib.request.urlopen(url)
raw_json = response.read().decode("utf-8")
data = json.loads(raw_json)
for j in data['results']:
    print(j)
In short, you need to change data to data['results'].
You can simply use the requests module:
import requests
import json
your_link = " "
r = requests.get(your_link)
data = json.loads(r.content)
That loads the JSON; then use the "results" key and loop through the data you got.
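For example, a small sketch assuming the same discover/movie URL as in the question:

import requests

api_key = "your api code"
your_link = "https://api.themoviedb.org/3/discover/movie?api_key=" + api_key + "&language=en-US&page=1"
data = requests.get(your_link).json()
# loop over the actual movie objects, not the top-level keys
for movie in data["results"]:
    print(movie["title"], movie["vote_average"])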
I have written code for calling the AlchemyLanguage API of Bluemix in Python. I need the keywords and entities, but it is only showing the first keyword and first entity for the text file. Where am I going wrong?
import requests
import urllib
import urllib2

def call_alchemy_api(text, API_KEY):
    payload = {'outputMode':'json','extract':'entities,keywords','sentiment':'1','maxRetrieve':'1', 'url':'https://www.ibm.com/us-en/'}
    payload['apikey'] = API_KEY
    encoded_text = urllib.quote_plus(text)
    payload['text'] = text
    data = urllib.urlencode(payload)
    url = 'https://gateway-a.watsonplatform.net/calls/text/TextGetCombinedData'
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req)
    return response

if __name__ == "__main__":
    api_key = 'xxxxxxxxxxxxxxxxxxxxxmyapi'
    f = open('in0.txt','r')
    text = f.read()
    print text
    response = call_alchemy_api(text, api_key)
    print response.read()
Change the maxRetrieve keyword's value.
Example:
payload = {'outputMode':'json','extract':'entities,keywords','sentiment':'1','maxRetrieve':'3', 'url':'https://www.ibm.com/us-en/'}
API Link:
http://www.ibm.com/watson/developercloud/alchemy-language/api/v1/
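Once maxRetrieve is raised, the response JSON should contain lists under 'keywords' and 'entities'. A small sketch of reading them, continuing from the question's call_alchemy_api call; the field names follow the legacy AlchemyAPI response format and may need adjusting:

import json

# response is the object returned by call_alchemy_api above
result = json.loads(response.read())
for kw in result.get('keywords', []):
    print(kw.get('text'))
for ent in result.get('entities', []):
    print(ent.get('text'), ent.get('type'))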