Using jsonpath in api.ai in case we have already our json file to query in? - python

import json
def makeWebhookResult(req):
if req.get("result").get("action") != "Phdapp":
return {}
result = req.get("result")
parameters = result.get("parameters")
Progr = parameters.get("PhDsubjects")
time = parameters.get("PhdTime")
Levp = parameters.get("PhDDegLevp")
with open('Sheet1.json') as f:
data = f.read()
jsondata = json.loads(data)
match = jsonpath.jsonpath(jsondata,
'$.features[[?(#.ProgramName == Progr && #.Level == Levp && #.StartDate == time)]].UniversityName')
speech = "This is the universities you were looking for " + match
This is the part of my python code which have errors i can't figure it out, I have an intent with action which is "Phdapp" with three parameters that i need to use their values in my jsonpath querying from "sheet1.json" file in the same repository on GitHub in json format. But i can't get data from my intent neither accessing my son file for querying...is it because api.ai is not compatible with jsonpath or it is the problem of my code! or if there is a best to use which easier it can be my pleasure to know it. Thanks

Related

Parsing Json with multiple "levels" with Python

I'm trying to parse a json file from an api call.
I have found this code that fits my need and trying to adapt it to what I want:
import math, urllib2, json, re
def download():
graph = {}
page = urllib2.urlopen("http://fx.priceonomics.com/v1/rates/?q=1")
jsrates = json.loads(page.read())
pattern = re.compile("([A-Z]{3})_([A-Z]{3})")
for key in jsrates:
matches = pattern.match(key)
conversion_rate = -math.log(float(jsrates[key]))
from_rate = matches.group(1).encode('ascii','ignore')
to_rate = matches.group(2).encode('ascii','ignore')
if from_rate != to_rate:
if from_rate not in graph:
graph[from_rate] = {}
graph[from_rate][to_rate] = float(conversion_rate)
return graph
And I've turned it into:
import math, urllib2, json, re
def download():
graph = {}
page = urllib2.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries")
jsrates = json.loads(page.read())
for pattern in jsrates['result'][0]['MarketName']:
for key in jsrates['result'][0]['Ask']:
matches = pattern.match(key)
conversion_rate = -math.log(float(jsrates[key]))
from_rate = matches.group(1).encode('ascii','ignore')
to_rate = matches.group(2).encode('ascii','ignore')
if from_rate != to_rate:
if from_rate not in graph:
graph[from_rate] = {}
graph[from_rate][to_rate] = float(conversion_rate)
return graph
Now the problem is that there is multiple level in the json "Result > 0, 1,2 etc"
json screenshot
for key in jsrates['result'][0]['Ask']:
I want the zero to be able to be any number, I don't know if thats clear.
So I could get all the ask price to match their marketname.
I have shortened the code so it doesnt make too long of a post.
Thanks
PS: sorry for the english, its not my native language.
You could loop through all of the result values that are returned, ignoring the meaningless numeric index:
for result in jsrates['result'].values():
ask = result.get('Ask')
if ask is not None:
# Do things with your ask...

KeywordAnalyser for lucene python?

I want to use lucene using python.
I used StandardAnalyser for indexing and searching both. It works fine but now, my requirement changes, I need to use KeywordAnalyser.
Code for StandardAnalyser:
#Importing packages
import lucene
lucene.initVM()
from org.apache.lucene import analysis, document, index, queryparser, search, store, util
#Initialize Parameters
analyzer = analysis.standard.StandardAnalyzer(util.Version.LUCENE_CURRENT)
config = index.IndexWriterConfig(util.Version.LUCENE_CURRENT, self.analyzer)
directory = store.FSDirectory.open(File(<path_where_to_index>))
iwriter = None
iwriter = index.IndexWriter(directory, config)
#Indexing Part
doc = document.Document()
doc.add(document.Field("fieldname", entity, document.Field.Store.YES, document.Field.Index.ANALYZED))
doc.add(document.Field("category", category, document.Field.Store.YES, document.Field.Index.ANALYZED))
iwriter.addDocument(doc)
iwriter.commit()
#Searching Part
ireader = index.IndexReader.open(directory)
isearcher = search.IndexSearcher(ireader)
parser = queryparser.classic.QueryParser(util.Version.LUCENE_CURRENT, "fieldname", self.analyzer)
query = parser.parse(entity)
hits = isearcher.search(query, None, 100).scoreDocs
print hits
for hit in hits:
hitDoc = isearcher.doc(hit.doc)
print hitDoc
Above code is using StandardAnalyser. I want to use KeywordAnalyser instead of StandardAnalyser.
I have changed analyser in below code. Below Code is using KeywordAnalyser but searching is not performing.
Code for KeywordAnalyser:
#Importing packages
import lucene
lucene.initVM()
from org.apache.lucene import analysis, document, index, queryparser, search, store, util
#Initialize Parameters
analyzer = analysis.core.KeywordAnalyser(util.Version.LUCENE_CURRENT)
config = index.IndexWriterConfig(util.Version.LUCENE_CURRENT, self.analyzer)
directory = store.FSDirectory.open(File(<path_where_to_index>))
iwriter = None
iwriter = index.IndexWriter(directory, config)
#Indexing Part
doc = document.Document()
doc.add(document.Field("fieldname", entity, document.Field.Store.YES, document.Field.Index.ANALYZED))
doc.add(document.Field("category", category, document.Field.Store.YES, document.Field.Index.ANALYZED))
iwriter.addDocument(doc)
iwriter.commit()
#Searching Part
ireader = index.IndexReader.open(directory)
isearcher = search.IndexSearcher(ireader)
parser = queryparser.classic.QueryParser(util.Version.LUCENE_CURRENT, "fieldname", self.analyzer)
query = parser.parse(entity)
hits = isearcher.search(query, None, 100).scoreDocs
print hits
for hit in hits:
hitDoc = isearcher.doc(hit.doc)
print hitDoc
Any help?
I found the solution of my question.
To use KeywordAnalyser, I need analysis.core. I can't use queryparser for searching because it mostly works for StandardAnalyser. To do search on KeywordAnalyser, I need to use index.Term and search.TermQuery
Searching Code:
term_parser = index.Term("fieldname", entity)
query = search.TermQuery(term_parser)
hits = isearcher.search(query, None, 10).scoreDocs

Optimize Python Script to parse xml

I'm parsing the US Patent XML files (downloaded from Google patent dumps) using Python and Beautifulsoup; parsed data is exported to MYSQL database.
Each year's data contains close to 200-300K patents - which means parsing 200-300K xml files.
The server on which I'm running the python script is pretty powerful - 16 cores, 160 gigs of RAM, etc. but still it is taking close to 3 days to parse one year's worth of data.
I've been learning and using python since 2 years - so I can get stuff done but do not know how to get it done in the most efficient manner. I'm reading on it.
How can I optimize the below script to make it efficient?
Any guidance would be greatly appreciated.
Below is the code:
from bs4 import BeautifulSoup
import pandas as pd
from pandas.core.frame import DataFrame
import MySQLdb as db
import os
cnxn = db.connect('xx.xx.xx.xx','xxxxx','xxxxx','xxxx',charset='utf8',use_unicode=True)
def separated_xml(infile):
file = open(infile, "r")
buffer = [file.readline()]
for line in file:
if line.startswith("<?xml "):
yield "".join(buffer)
buffer = []
buffer.append(line)
yield "".join(buffer)
file.close()
def get_data(soup):
df = pd.DataFrame(columns = ['doc_id','patcit_num','patcit_document_id_country', 'patcit_document_id_doc_number','patcit_document_id_kind','patcit_document_id_name','patcit_document_id_date','category'])
if soup.findAll('us-citation'):
cit = soup.findAll('us-citation')
else:
cit = soup.findAll('citation')
doc_id = soup.findAll('publication-reference')[0].find('doc-number').text
for x in cit:
try:
patcit_num = x.find('patcit')['num']
except:
patcit_num = None
try:
patcit_document_id_country = x.find('country').text
except:
patcit_document_id_country = None
try:
patcit_document_id_doc_number = x.find('doc-number').text
except:
patcit_document_id_doc_number = None
try:
patcit_document_id_kind = x.find('kind').text
except:
patcit_document_id_kind = None
try:
patcit_document_id_name = x.find('name').text
except:
patcit_document_id_name = None
try:
patcit_document_id_date = x.find('date').text
except:
patcit_document_id_date = None
try:
category = x.find('category').text
except:
category = None
print doc_id
val = {'doc_id':doc_id,'patcit_num':patcit_num, 'patcit_document_id_country':patcit_document_id_country,'patcit_document_id_doc_number':patcit_document_id_doc_number, 'patcit_document_id_kind':patcit_document_id_kind,'patcit_document_id_name':patcit_document_id_name,'patcit_document_id_date':patcit_document_id_date,'category':category}
df = df.append(val, ignore_index=True)
df.to_sql(name = 'table_name', con = cnxn, flavor='mysql', if_exists='append')
print '1 doc exported'
i=0
l = os.listdir('/path/')
for item in l:
f = '/path/'+item
print 'Currently parsing - ',item
for xml_string in separated_xml(f):
soup = BeautifulSoup(xml_string,'xml')
if soup.find('us-patent-grant'):
print item, i, xml_string[177:204]
get_data(soup)
else:
print item, i, xml_string[177:204],'***********************************soup not found********************************************'
i+=1
print 'DONE!!!'
Here is a tutorial on multi-threading, because currently that code will run on 1 thread, 1 core.
Remove all try/except statements and handle the code properly. Exceptions are expensive.
Run a profiler to find the chokepoints, and multi-thread those or find a way to do them less times.
So, you're doing two things wrong. First, you're using BeautifulSoup, which is slow, and second, you're using a "find" call, which is also slow.
As a first cut, look at lxml's ability to pre-compile xpath queries (Look at the heading "The Xpath class). That will give you a huge speed boost.
Alternatively, I've been working on a library to do this kind of parsing declaratively, using best practices for lxml speed, including precompiled xpath called yankee.
Yankee on PyPI |
Yankee on GitHub
You could do the same thing with yankee like this:
from yankee.xml import Schema, fields as f
# Create a schema for citations
class Citation(Schema):
num = f.Str(".//patcit")
country = f.Str(".//country")
# ... and so forth for the rest of your fields
# Then create a "wrapper" to get all the citations
class Patent(Schema):
citations = f.List(".//us-citation|.//citation")
# Then just feed the Schema your lxml.etrees for each patent:
import lxml.etree as ET
schema = Patent()
for _, doc in ET.iterparse(xml_string, "xml"):
result = schema.load(doc)
The result will look like this:
{
"citations": [
{
"num": "<some value>",
"country": "<some value>",
},
{
"num": "<some value>",
"country": "<some value>",
},
]
}
I would also check out Dask to help you multithread it more efficiently. Pretty much all my projects use it.

Fetching language detection from Google api

I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to provide the keywords in a url (while looping) and for the Google language api to return what type of language was the keyword in.
I have it working manually. If I enter (with the correct api key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct, 'merde' is French.
so far I have this code but I keep getting server unreachable errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
#not working
def parse_result(result):
"""Parse a JSONP result string and return a list of terms"""
# Deserialize JSON to Python objects
result_object = json.loads(result)
#Get the rows in the table, then get the second column's value
# for each row
return row in result_object
#not working
def retrieve_terms(seedterm):
print(seedterm)
"""Retrieves and parses data and returns a list of terms"""
url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
url = url_template % {"seed": seedterm}
try:
with urllib2.urlopen(url) as data:
data = perform_request(seedterm)
result = data.read()
except:
sys.stderr.write('%s\n' % 'Could not request data from server')
exit(E_OPERATION_ERROR)
#terms = parse_result(result)
#print terms
print result
def main(argv):
filename = argv[1]
csvfile = open(filename, 'r')
csvreader = csv.DictReader(csvfile)
rows = []
for row in csvreader:
rows.append(row)
sortedrows = sorted(rows, key=itemgetter('impressions'), reverse = True)
keys = sortedrows[0].keys()
for item in sortedrows:
retrieve_terms(item['keywords'])
try:
outputfile = open('Output_%s.csv' % (filename),'w')
except IOError:
print("The file is active in another program - close it first!")
sys.exit()
dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
dict_writer.writer.writerow(keys)
dict_writer.writerows(sortedrows)
outputfile.close()
print("File is Done!! Check your folder")
if __name__ == '__main__':
start_time = time.clock()
main(sys.argv)
print("\n")
print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try to add referrer, userip as described in the docs:
An area to pay special attention to
relates to correctly identifying
yourself in your requests.
Applications MUST always include a
valid and accurate http referer header
in their requests. In addition, we
ask, but do not require, that each
request contains a valid API Key. By
providing a key, your application
provides us with a secondary
identification mechanism that is
useful should we need to contact you
in order to correct any problems. Read
more about the usefulness of having an
API key
Developers are also encouraged to make
use of the userip parameter (see
below) to supply the IP address of the
end-user on whose behalf you are
making the API request. Doing so will
help distinguish this legitimate
server-side traffic from traffic which
doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint
api_key, userip = None, None
query = {'q' : 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"
if userip:
query.update(userip=userip)
if api_key:
query.update(key=api_key)
url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' %(
urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
value = seedterm
else: # bytes
value = seedterm.decode(put_encoding_here)
url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))

How can I talk to UniProt over HTTP in Python?

I'm trying to get some results from UniProt, which is a protein database (details are not important). I'm trying to use some script that translates from one kind of ID to another. I was able to do this manually on the browser, but could not do it in Python.
In http://www.uniprot.org/faq/28 there are some sample scripts. I tried the Perl one and it seems to work, so the problem is my Python attempts. The (working) script is:
## tool_example.pl ##
use strict;
use warnings;
use LWP::UserAgent;
my $base = 'http://www.uniprot.org';
my $tool = 'mapping';
my $params = {
from => 'ACC', to => 'P_REFSEQ_AC', format => 'tab',
query => 'P13368 P20806 Q9UM73 P97793 Q17192'
};
my $agent = LWP::UserAgent->new;
push #{$agent->requests_redirectable}, 'POST';
print STDERR "Submitting...\n";
my $response = $agent->post("$base/$tool/", $params);
while (my $wait = $response->header('Retry-After')) {
print STDERR "Waiting ($wait)...\n";
sleep $wait;
print STDERR "Checking...\n";
$response = $agent->get($response->base);
}
$response->is_success ?
print $response->content :
die 'Failed, got ' . $response->status_line .
' for ' . $response->request->uri . "\n";
My questions are:
1) How would you do that in Python?
2) Will I be able to massively "scale" that (i.e., use a lot of entries in the query field)?
question #1:
This can be done using python's urllibs:
import urllib, urllib2
import time
import sys
query = ' '.join(sys.argv)
# encode params as a list of 2-tuples
params = ( ('from','ACC'), ('to', 'P_REFSEQ_AC'), ('format','tab'), ('query', query))
# url encode them
data = urllib.urlencode(params)
url = 'http://www.uniprot.org/mapping/'
# fetch the data
try:
foo = urllib2.urlopen(url, data)
except urllib2.HttpError, e:
if e.code == 503:
# blah blah get the value of the header...
wait_time = int(e.hdrs.get('Retry-after', 0))
print 'Sleeping %i seconds...' % (wait_time,)
time.sleep(wait_time)
foo = urllib2.urlopen(url, data)
# foo is a file-like object, do with it what you will.
foo.read()
You're probably better off using the Protein Identifier Cross Reference service from the EBI to convert one set of IDs to another. It has a very good REST interface.
http://www.ebi.ac.uk/Tools/picr/
I should also mention that UniProt has very good webservices available. Though if you are tied to using simple http requests for some reason then its probably not useful.
Let's assume that you are using Python 2.5.
We can use httplib to directly call the web site:
import httplib, urllib
querystring = {}
#Build the query string here from the following keys (query, format, columns, compress, limit, offset)
querystring["query"] = ""
querystring["format"] = "" # one of html | tab | fasta | gff | txt | xml | rdf | rss | list
querystring["columns"] = "" # the columns you want comma seperated
querystring["compress"] = "" # yes or no
## These may be optional
querystring["limit"] = "" # I guess if you only want a few rows
querystring["offset"] = "" # bring on paging
##From the examples - query=organism:9606+AND+antigen&format=xml&compress=no
##Delete the following and replace with your query
querystring = {}
querystring["query"] = "organism:9606 AND antigen"
querystring["format"] = "xml" #make it human readable
querystring["compress"] = "no" #I don't want to have to unzip
conn = httplib.HTTPConnection("www.uniprot.org")
conn.request("GET", "/uniprot/?"+ urllib.urlencode(querystring))
r1 = conn.getresponse()
if r1.status == 200:
data1 = r1.read()
print data1 #or do something with it
You could then make a function around creating the query string and you should be away.
check this out bioservices. they interface a lot of databases through Python.
https://pythonhosted.org/bioservices/_modules/bioservices/uniprot.html
conda install bioservices --yes
in complement to O.rka answer:
Question 1:
from bioservices import UniProt
u = UniProt()
res = u.get_df("P13368 P20806 Q9UM73 P97793 Q17192".split())
This returns a dataframe with all information about each entry.
Question 2: same answer. This should scale up.
Disclaimer: I'm the author of bioservices
There is a python package in pip which does exactly what you want
pip install uniprot-mapper

Categories

Resources