How to merge multiple JSON files in Python

I have had to create multiple JSON files as part of processing a corpus (using GNRD http://gnrd.globalnames.org/ for scientific name extraction). I now want to use these JSON files to annotate said corpus as a whole.
I am trying to merge the multiple JSON files in Python. The contents of each JSON file are arrays of just scientific_name (key) and the name found (value). Below is an example of one of the shorter files:
{
"file":"biodiversity_trophic_9.txt",
"names":[
{
"scientificName":"Bufo"
},
{
"scientificName":"Eleutherodactylus talamancae"
},
{
"scientificName":"E. punctariolus"
},
{
"scientificName":"Norops lionotus"
},
{
"scientificName":"Centrolenella prosoblepon"
},
{
"scientificName":"Sibon annulatus"
},
{
"scientificName":"Colostethus flotator"
},
{
"scientificName":"C. inguinalis"
},
{
"scientificName":"Eleutherodactylus"
},
{
"scientificName":"Hyla columba"
},
{
"scientificName":"Bufo haematiticus"
},
{
"scientificName":"S. annulatus"
},
{
"scientificName":"Leptodeira septentrionalis"
},
{
"scientificName":"Imantodes cenchoa"
},
{
"scientificName":"Oxybelis brevirostris"
},
{
"scientificName":"Cressa"
},
{
"scientificName":"Coloma"
},
{
"scientificName":"Perlidae"
},
{
"scientificName":"Hydropsychidae"
},
{
"scientificName":"Hyla"
},
{
"scientificName":"Norops"
},
{
"scientificName":"Hyla colymbiphyllum"
},
{
"scientificName":"Colostethus inguinalis"
},
{
"scientificName":"Oxybelis"
},
{
"scientificName":"Rana warszewitschii"
},
{
"scientificName":"R. warszewitschii"
},
{
"scientificName":"Rhyacophilidae"
},
{
"scientificName":"Daphnia magna"
},
{
"scientificName":"Hyla colymba"
},
{
"scientificName":"Centrolenella"
},
{
"scientificName":"Orconectes nais"
},
{
"scientificName":"Orconectes neglectus"
},
{
"scientificName":"Campostoma anomalum"
},
{
"scientificName":"Caridina"
},
{
"scientificName":"Decapoda"
},
{
"scientificName":"Atyidae"
},
{
"scientificName":"Cerastoderma edule"
},
{
"scientificName":"Rana aurora"
},
{
"scientificName":"Riffle"
},
{
"scientificName":"Calopterygidae"
},
{
"scientificName":"Elmidae"
},
{
"scientificName":"Gyrinidae"
},
{
"scientificName":"Gerridae"
},
{
"scientificName":"Naucoridae"
},
{
"scientificName":"Oligochaeta"
},
{
"scientificName":"Veliidae"
},
{
"scientificName":"Libellulidae"
},
{
"scientificName":"Philopotamidae"
},
{
"scientificName":"Ephemeroptera"
},
{
"scientificName":"Psephenidae"
},
{
"scientificName":"Baetidae"
},
{
"scientificName":"Corduliidae"
},
{
"scientificName":"Zygoptera"
},
{
"scientificName":"B. buto"
},
{
"scientificName":"C. euknemos"
},
{
"scientificName":"C. ilex"
},
{
"scientificName":"E. padi noblei"
},
{
"scientificName":"E. padi"
},
{
"scientificName":"E. bufo"
},
{
"scientificName":"E. butoni"
},
{
"scientificName":"E. crassi"
},
{
"scientificName":"E. cruentus"
},
{
"scientificName":"H. colymbiphyllum"
},
{
"scientificName":"N. aterina"
},
{
"scientificName":"S. ilex"
},
{
"scientificName":"Anisoptera"
},
{
"scientificName":"Riffle delta"
}
],
"total":67,
"status":200,
"unique":true,
"engines":[
"TaxonFinder",
"NetiNeti"
],
"verbatim":false,
"input_url":null,
"token_url":"http://gnrd.globalnames.org/name_finder.html?token=2rtc4e70st",
"parameters":{
"engine":0,
"return_content":false,
"best_match_only":false,
"data_source_ids":[
],
"detect_language":true,
"all_data_sources":false,
"preferred_data_sources":[
]
},
"execution_time":{
"total_duration":3.1727607250213623,
"find_names_duration":1.9656541347503662,
"text_preparation_duration":1.000107765197754
},
"english_detected":true
}
The issue I have is that there may be duplicates across the files, which I want to remove (otherwise I could just concatenate the files, I guess). The questions I have found elsewhere are about merging extra keys and values to extend the arrays themselves.
Can anyone give me guidance on how to overcome this issue?

If I understand correctly, you want to get all "scientificNames" values in the "names" elements of a batch of files. If I'm wrong, you should give an expected output to make things easier to understand.
I'd do something like this:
import json

all_names = set()  # use a set to avoid duplicates

# put all your files in there
for filename in ('file1.json', 'file2.json'):
    try:
        with open(filename, 'rt') as finput:
            data = json.load(finput)
            for name in data.get('names'):
                all_names.add(name.get('scientificName'))
    except Exception as exc:
        print("Skipped file {} because of exception {}".format(filename, str(exc)))

print(all_names)
And in case you want output in a similar format to the initial files, add:
import pprint
pprint.pprint({"names": [{"scientificName": name} for name in all_names], "total": len(all_names)})

Related

Efficient way to calculate metrics from nested JSON, via python?

What is the most efficient way to calculate metrics from nested JSON, via Python?
Given the JSON blob below, how can I calculate the user (i.e. profileId) with the most events, without using the pandas library and without multiple nested for loops? I am having trouble writing code that would not rely on O(N^2).
{
"kind":"admin#reports#activities",
"etag":"\"5g8\"",
"nextPageToken":"A:1651795128914034:-4002873813067783265:151219070090:C02f6wppb",
"items":[
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:59:39.421Z",
"uniqueQualifier":"5526793068617678141",
"applicationName":"token",
"customerId":"cds"
},
"etag":"\"jkYcURYoi8\"",
"actor":{
"email":"blah#blah.net",
"profileId":"1323"
},
"ipAddress":"107.178.193.87",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"admin"
},
{
"name":"method_name",
"value":"directory.users.list"
},
{
"name":"client_id",
"value":"722230783769-dsta4bi9fkom72qcu0t34aj3qpcoqloq.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"7158"
},
{
"name":"product_bucket",
"value":"GSUITE_ADMIN"
},
{
"name":"app_name",
"value":"Untitled project"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
},
{
"kind":"admin#reports#activity",
"id":{
"time":"2022-05-05T23:58:48.914Z",
"uniqueQualifier":"-4002873813067783265",
"applicationName":"token",
"customerId":"df"
},
"etag":"\"5T53xK7dpLei95RNoKZd9uz5Xb8LJpBJb72fi2HaNYM/9DTdB8t7uixvUbjo4LUEg53_gf0\"",
"actor":{
"email":"blah.blah#bebe.net",
"profileId":"1324"
},
"ipAddress":"54.80.168.30",
"events":[
{
"type":"auth",
"name":"activity",
"parameters":[
{
"name":"api_name",
"value":"gmail"
},
{
"name":"method_name",
"value":"gmail.users.messages.list"
},
{
"name":"client_id",
"value":"927538837578.apps.googleusercontent.com"
},
{
"name":"num_response_bytes",
"intValue":"2"
},
{
"name":"product_bucket",
"value":"GMAIL"
},
{
"name":"app_name",
"value":"Zapier"
},
{
"name":"client_type",
"value":"WEB"
}
]
}
]
}
]
}
This would be a good place to start:
data = ...  # your dict (the parsed JSON from above)
ids = [x['actor']['profileId'] for x in data['items']]
print(ids)
Output:
['1323', '1324']
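To get from there to the user with the most events in a single pass (no pandas, no nested loops), a collections.Counter sketch along these lines should work; the field names are taken from the blob above:
from collections import Counter

# count the number of events per profileId in one pass over the items
event_counts = Counter()
for item in data['items']:
    profile_id = item['actor']['profileId']
    event_counts[profile_id] += len(item.get('events', []))

# most_common(1) returns a list with the (profileId, count) pair of the busiest user
print(event_counts.most_common(1))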

python parsing strange JSON data

How should I parse (with Python 3) data in this "unusual" format?
As you can see, inside the "variables" dictionary the data in capitals has no label; it is provided as a literal key. Therefore, when I loop over the entries inside "variables", all I get is the strings in capitals and nothing else. I need, obviously, to get each capitalised key together with the value inside it.
{
"variables": {
"ABSENCE_OSL_PROD": {
"value": "REZWWnBTejN5Ng=="
},
"ACTION_OSL_INT": {
"value": "S0RXSVNTbmFhNw=="
},
"ACTION_OSL_PROD": {
"value": "RUJCaDJGnmFnUg=="
},
"API_STORE_OSL_INT": {
"value": "U3lxaVhogWtIcg=="
}
},
"id": 4,
"type": "Vsts"
}
To load the entries of "variables" into the local variable space:
data = {
"variables": {
"ABSENCE_OSL_PROD": {
"value": "REZWWnBTejN5Ng=="
},
"ACTION_OSL_INT": {
"value": "S0RXSVNTbmFhNw=="
},
"ACTION_OSL_PROD": {
"value": "RUJCaDJGnmFnUg=="
},
"API_STORE_OSL_INT": {
"value": "U3lxaVhogWtIcg=="
}
},
"id": 4,
"type": "Vsts"
}
for variable_name, variable_content in data['variables'].items():
    # note: assigning through locals() is only reliable at module level,
    # not inside a function body
    locals()[variable_name] = variable_content['value']

print(ABSENCE_OSL_PROD)
# prints "REZWWnBTejN5Ng=="
With a dict comprehension you can get the same thing in a concise, time-efficient way:
ugly = {
"variables": {
"ABSENCE_OSL_PROD": {
"value": "REZWWnBTejN5Ng=="
},
"ACTION_OSL_INT": {
"value": "S0RXSVNTbmFhNw=="
},
"ACTION_OSL_PROD": {
"value": "RUJCaDJGnmFnUg=="
},
"API_STORE_OSL_INT": {
"value": "U3lxaVhogWtIcg=="
}
},
"id": 4,
"type": "Vsts"
}
proper = {elt: ugly["variables"][elt]["value"] for elt in ugly["variables"]}
print(proper)
returns
{'ABSENCE_OSL_PROD': 'REZWWnBTejN5Ng==', 'ACTION_OSL_INT': 'S0RXSVNTbmFhNw==', 'ACTION_OSL_PROD': 'RUJCaDJGnmFnUg==', 'API_STORE_OSL_INT': 'U3lxaVhogWtIcg=='}
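Individual values can then be looked up directly, for example:
print(proper['ACTION_OSL_INT'])
# prints "S0RXSVNTbmFhNw=="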

PySpark - Convert a heterogeneous array JSON array to Spark dataframe and flatten it

I have streaming data coming in as a JSON array and I want to flatten it out into a single row in a Spark dataframe using Python.
Here is what the JSON data looks like:
{
"event": [
{
"name": "QuizAnswer",
"count": 1
}
],
"custom": {
"dimensions": [
{
"title": "Are you:"
},
{
"question_id": "5965"
},
{
"option_id": "19029"
},
{
"option_title": "Non-binary"
},
{
"item": "Non-binary"
},
{
"tab_index": "3"
},
{
"tab_count": "4"
},
{
"tab_initial_index": "4"
},
{
"page": "home"
},
{
"environment": "testing"
},
{
"page_count": "0"
},
{
"widget_version": "2.2.44"
},
{
"session_count": "1"
},
{
"quiz_settings_id": "1020"
},
{
"quiz_session": "6e5a3b5c-9961-4c1b-a2af-3374bbeccede"
},
{
"shopify_customer_id": "noid"
},
{
"cart_token": ""
},
{
"app_version": "2.2.44"
},
{
"shop_name": "safety-valve.myshopify.com"
}
],
"metrics": []
}
}
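One possible approach (a sketch, assuming the payload arrives as a JSON string and that merging the single-key dicts in custom.dimensions into one flat record is what is wanted) is to flatten in plain Python first and then hand the result to Spark:
import json
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

payload = json.loads(raw_json)  # raw_json: the JSON string shown above

# each entry in custom.dimensions is a single-key dict, so merging them
# gives one flat dict of column -> value
flat = {}
for d in payload["custom"]["dimensions"]:
    flat.update(d)

# pull the event name and count up to the top level as well
flat["event_name"] = payload["event"][0]["name"]
flat["event_count"] = payload["event"][0]["count"]

df = spark.createDataFrame([Row(**flat)])
df.show(truncate=False)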

Improving elasticsearch performance

I'm using Elasticsearch in a Python web app in order to query news documents. There are currently 100,000 documents in the database.
The original DB is a MongoDB one, and Elasticsearch is plugged in through the mongoriver plugin.
The problem is that the function takes ~850ms to return the results. I'd like to decrease that number as much as possible.
Here's the Python code I'm using to query the DB (the limit is usually 16):
def search_news(term, limit, page, flagged_articles):
    query = {
        "query": {
            "from": page * limit,
            "size": limit,
            "multi_match": {
                "query": term,
                "fields": ["title^3", "category^5", "entities.name^5", "art_text^1", "summary^1"]
            }
        },
        "filter": {
            "not": {
                "filter": {
                    "ids": {
                        "values": flagged_articles
                    }
                },
                "_cache": True
            }
        }
    }
    es_query = json_util.dumps(query)
    uri = 'http://localhost:9200/newsidx/_search'
    r = requests.get(uri, data=es_query)
    results = json.loads(r.text)
    data = []
    for res in results['hits']['hits']:
        data.append(res['_source'])
    return data
And here's the index mapping:
{
"news": {
"properties": {
"actual_rank": {
"type": "long"
},
"added": {
"type": "date",
"format": "dateOptionalTime"
},
"api_id": {
"type": "long"
},
"art_text": {
"type": "string"
},
"category": {
"type": "string"
},
"downvotes": {
"type": "long"
},
"entities": {
"properties": {
"etype": {
"type": "string"
},
"name": {
"type": "string"
}
}
},
"flags": {
"properties": {
"a": {
"type": "long"
},
"b": {
"type": "long"
},
"bad_image": {
"type": "long"
},
"c": {
"type": "long"
},
"d": {
"type": "long"
},
"innapropiate": {
"type": "long"
},
"irrelevant_info": {
"type": "long"
},
"miscategorized": {
"type": "long"
}
}
},
"media": {
"type": "string"
},
"published": {
"type": "string"
},
"published_date": {
"type": "date",
"format": "dateOptionalTime"
},
"show": {
"type": "boolean"
},
"source": {
"type": "string"
},
"source_rank": {
"type": "double"
},
"summary": {
"type": "string"
},
"times_showed": {
"type": "long"
},
"title": {
"type": "string"
},
"top_entities": {
"properties": {
"einfo_test": {
"type": "string"
},
"etype": {
"type": "string"
},
"name": {
"type": "string"
}
}
},
"tweet_article_poster": {
"type": "string"
},
"tweet_favourites": {
"type": "long"
},
"tweet_retweets": {
"type": "long"
},
"tweet_user_rank": {
"type": "double"
},
"upvotes": {
"type": "long"
},
"url": {
"type": "string"
}
}
}
}
Edit: The response time was measured on the server, given the tornado server information output.
I've rewritten your query somewhat here, moving from and size to the outer scope, adding a filtered query clause, and changing your not filter to a bool/must_not filter, which should be cached by default:
{
    "query": {
        "filtered": {
            "query": {
                "multi_match": {
                    "query": term,
                    "fields": ["title^3", "category^5", "entities.name^5", "art_text^1", "summary^1"]
                }
            },
            "filter": {
                "bool": {
                    "must_not": {
                        "ids": {"values": flagged_articles}
                    }
                }
            }
        }
    },
    "from": page * limit,
    "size": limit
}
I haven't tested this, and I haven't made sense of your mapping as it is jumbled, so there might be some improvements to be made there.
Edit: This is a great read on why to use the bool filter: http://www.elasticsearch.org/blog/all-about-elasticsearch-filter-bitsets/ - in short, bool uses 'bitsets', which are very fast on subsequent queries.
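For reference, here is a sketch of how the rewritten query could slot back into the search_news function from the question (same requests call and endpoint as above; untested, like the query itself, and using the old ES 1.x filtered-query syntax the question dates from):
import json
import requests

def search_news(term, limit, page, flagged_articles):
    # filtered query with a cached bool/must_not filter,
    # and from/size moved to the top level of the request body
    query = {
        "from": page * limit,
        "size": limit,
        "query": {
            "filtered": {
                "query": {
                    "multi_match": {
                        "query": term,
                        "fields": ["title^3", "category^5", "entities.name^5",
                                   "art_text^1", "summary^1"]
                    }
                },
                "filter": {
                    "bool": {
                        "must_not": {"ids": {"values": flagged_articles}}
                    }
                }
            }
        }
    }
    r = requests.get('http://localhost:9200/newsidx/_search', data=json.dumps(query))
    results = r.json()
    return [hit['_source'] for hit in results['hits']['hits']]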
First of all you can add the boosts to your mapping (assuming it doesn't interfere with your other queries) like this:
"title": {
"boost": 3.0,
"type": "string"
},
"category": {
"boost": 5.0,
"type": "string"
},
etc.
Then set up a bool query with field (or term) queries like this:
{
    "query": {
        "bool": {
            "should": [
                {"field": {"title": term}},
                {"field": {"category": term}}
            ],
            "must_not": {
                "ids": {"values": flagged_articles}
            }
        }
    },
    "from": page * limit,
    "size": limit
}
This should perform better, but without access to your setup I can't test it :)

Filter timestamp with mongoDb

Suppose this JSON export from my MongoDB:
{
"_id":{
"$oid":"51ca002d9e67460354bb0089"
},
"node":{
"id":"1",
"components":[
{
"sensors":[
{
"name":"backup-job-name",
"value":"Job_2"
},
{
"name":"backup-job-id",
"value":"187"
},
{
"name":"backup-start-date",
"value":"1372138227"
},
{
"name":"backup-stop-date",
"value":"1372138235"
},
{
"name":"backup-nb-errors",
"value":"0"
},
{
"name":"backup-nb-warnings",
"value":"0"
},
{
"name":"backup-source-files",
"value":"402"
},
{
"name":"backup-source-bytes",
"value":"168571449"
},
{
"name":"backup-status",
"value":null
}
],
"type":"backup",
"serial":"1-backup-backup-job-name-Job_2"
},
{
"sensors":[
{
"name":"backup-job-name",
"value":"Test_bckp"
},
{
"name":"backup-job-id",
"value":""
},
{
"name":"backup-start-date",
"value":"0"
},
{
"name":"backup-stop-date",
"value":"0"
},
{
"name":"backup-nb-errors",
"value":"0"
},
{
"name":"backup-nb-warnings",
"value":"0"
},
{
"name":"backup-source-files",
"value":"0"
},
{
"name":"backup-source-bytes",
"value":"0"
},
{
"name":"backup-status",
"value":null
}
],
"type":"backup",
"serial":"1-backup-backup-job-name-Test_bckp"
}
]
},
"timestamp":1372192813
}
I work with Python and I'd like to get the documents where "backup-start-date" (stored as a timestamp) is greater than a given value.
I've tried:
collection.find({
    'node.components.type': 'backup',
    'node.components.sensors': {'name': 'backup-start-date',
                                'value': {'$gte': ts_to_compare_with}}})
but I get no documents in the result. Is my query wrong?
There are two things here: you want to match a sensor document inside the components array, and $gte only works on integers and dates.
If you convert the data to ints, then you can use $gte and query like so:
db.test.find({
    'node.components.type': 'backup',
    'node.components.sensors': {
        $elemMatch: {'name': 'backup-start-date',
                     'value': {'$gte': 168571445}}}})
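Since the question is asked from Python, here is a hedged PyMongo equivalent of that query (it assumes collection is a pymongo collection object and that the stored values have already been converted to integers):
# $elemMatch must be quoted as a string key in Python
results = collection.find({
    'node.components.type': 'backup',
    'node.components.sensors': {
        '$elemMatch': {'name': 'backup-start-date',
                       'value': {'$gte': ts_to_compare_with}}
    }
})
for doc in results:
    print(doc['_id'])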
