JSON File Merging With Primary Keys in Python - python

My code was working as of yesterday but today I ran the script and started getting this error saying I can't use strings to access my JSON dictionary anymore. As I understand it, I am already iterating through my dictionary entries and it is valid JSON so I should be able to access information in it like a dictionary with string values instead of just the column number:
S:\Scripts\ZACH\DB MERGE>python jsonMerge.py
Beginning Merge...
Traceback (most recent call last):
File "jsonMerge.py", line 42, in <module>
if intResult['SCH_NAME'] == extResult['SCH_NAME'] and intResult['SCH_CITY']
== extResult['SCH_CITY'] :
TypeError: list indices must be integers or slices, not str
I'm merging 2 validated JSON files like this:
[{"SCH_ID": "13554", "SCH_NAME": "100 Mile House Elementary", "SCH_ADDR": "Box 460, 145 North Birch", "SCH_CITY": "100 Mile House", "SCH_PROV": "BC", "SCH_PCODE": "V0K 2E0", "SCH_PHONE": "(250)395-2258", "SCH_FAX": "(250)395-3621", "SCH_DIST": "1027", "SCH_TYPE": "E", "SCH_P_REP": "FB", "SCH_G_REP": "", "SCH_P_COM": "LOC", "SCH_G_COM": "", "SCH_REBT": "10", "SCH_REBT2": "0", "SCH_P_CID": "23", "SCH_G_CID": "0", "SCH_P_CCD": "SR", "SCH_G_CCD": "", "DATE1": "", "DATE2": "", "PLAN1": "20G", "PLAN2": "2GR", "LNOPST": "FALSE"},{"SCH_ID": "16101", "SCH_NAME": "1 Step Ahead Preschool", "SCH_ADDR": "1340 Kingfisher Ave.", "SCH_CITY": "Kitimat", "SCH_PROV": "BC", "SCH_PCODE": "V8C 1G6", "SCH_PHONE": "(250)632-2288", "SCH_FAX": "", "SCH_DIST": "", "SCH_TYPE": "E", "SCH_P_REP": "FB", "SCH_G_REP": "", "SCH_P_COM": "P", "SCH_G_COM": "", "SCH_REBT": "0", "SCH_REBT2": "0", "SCH_P_CID": "23", "SCH_G_CID": "0", "SCH_P_CCD": "SR", "SCH_G_CCD": "", "DATE1": "", "DATE2": "", "PLAN1": "200", "PLAN2": "0", "LNOPST": "FALSE"},{"SCH_ID": "16736", "SCH_NAME": "1st Step Montessori", "SCH_ADDR": "8884 Russell Drive", "SCH_CITY": "Delta", "SCH_PROV": "BC", "SCH_PCODE": "V4C 4P8", "SCH_PHONE": "(604)417-3290", "SCH_FAX": "", "SCH_DIST": "1037", "SCH_TYPE": "E", "SCH_P_REP": "GF", "SCH_G_REP": "", "SCH_P_COM": "MWS", "SCH_G_COM": "", "SCH_REBT": "10", "SCH_REBT2": "0", "SCH_P_CID": "18", "SCH_G_CID": "0", "SCH_P_CCD": "SB", "SCH_G_CCD": "", "DATE1": "", "DATE2": "", "PLAN1": "20G", "PLAN2": "0", "LNOPST": "FALSE"},{"SCH_ID": "1959", "SCH_NAME": "150 Mile Elementary", "SCH_ADDR": "Box 259, 3081 Hwy. 
97", "SCH_CITY": "150 Mile House", "SCH_PROV": "BC", "SCH_PCODE": "V0K 2G0", "SCH_PHONE": "(250)296-3356", "SCH_FAX": "(250)296-3291", "SCH_DIST": "1027", "SCH_TYPE": "E", "SCH_P_REP": "FB", "SCH_G_REP": "", "SCH_P_COM": "MWS", "SCH_G_COM": "", "SCH_REBT": "10", "SCH_REBT2": "0", "SCH_P_CID": "23", "SCH_G_CID": "0", "SCH_P_CCD": "SR", "SCH_G_CCD": "", "DATE1": "9/12/2018", "DATE2": "10/30/2018", "PLAN1": "2GS", "PLAN2": "2GR", "LNOPST": "FALSE"}]
and:
[{"District Number": "82", "School Code": "8297024", "SCH_NAME": "Na Aksa Gyilak'yoo", "SCH_ADDR": "PO Box 544 STN Main", "SCH_CITY": "Terrace", "SCH_PROV": "BC", "SCH_PCODE": "V8G 4B5", "Principal Title": "Mrs", "Principal First name": "Colleen", "Principal Last Name": "Austin", "School Type": "Standard School", "Grade Range": "K-12", "School Category": "Independent School", "Funding Group(s)": "2", "NLC: Early Learning": "no", "NLC: Afterschool": "no", "NLC: Cont. Ed.": "no", "NLC: Seniors": "no", "NLC: Comm. Sport": "no", "NLC: Comm. Use": "no", "NLC: Integr. Svcs.": "no", "SCH_PHONE": "(250)615-2844", "SCH_FAX": "(250)615-2833", "Email": "kalumteacher#gmail.com", "Enrolment Total": "80", "Enrolment As Of": "September 30 2018", "KH Enrolment": "1", "KF Enrolment": "11", "HS Registration": "0", "SU Enrolment": "0", "EU Enrolment": "0", "Grade 1 Enrolment": "2", "Grade 2 Enrolment": "8", "Grade 3 Enrolment": "4", "Grade 4 Enrolment": "5", "Grade 5 Enrolment": "4", "Grade 6 Enrolment": "8", "Grade 7 Enrolment": "4", "Grade 8 Enrolment": "9", "Grade 9 Enrolment": "5", "Grade 10 Enrolment": "11", "Grade 11 Enrolment": "3", "Grade 12 Enrolment": "5"}]
using SCH_NAME and SCH_CITY as the primary keys:
def find_matches(extFile, intFile):
    """Return (external, internal) record pairs matching on the primary keys.

    Each input file contains ONE JSON document: an array of school records
    (dicts). Parsing the file line by line with json.loads() yields the outer
    *list*, and indexing a list with 'SCH_NAME' raises
    "TypeError: list indices must be integers or slices, not str" -- so the
    whole document is parsed once with json.load() and the resulting lists
    are iterated directly.

    SCH_NAME and SCH_CITY together act as the primary key for the merge.
    """
    with open(extFile, 'r') as extF:
        ext_records = json.load(extF)
    with open(intFile, 'r') as intF:
        # Load the internal file once, up front, instead of re-opening and
        # re-parsing it for every external record.
        int_records = json.load(intF)

    matches = []
    for extResult in ext_records:
        for intResult in int_records:
            # Check if rows match on both primary-key columns.
            if (intResult['SCH_NAME'] == extResult['SCH_NAME']
                    and intResult['SCH_CITY'] == extResult['SCH_CITY']):
                matches.append((extResult, intResult))
    return matches
Can anyone help shine a light on what might be going wrong here?

Yea ok looking at the error and the two json files, they're coming in as lists instead of dicts. You may want to figure out why this happened, but you can do a quick workaround like so.
intResult = json.loads(intLine)[0]
note you may need to do the same for extResult

Related

"Linking" Nested Dicts

I'm fairly new to programming, so please bear with me. I've been working on a project and am learning a huge amount about dicts and lists. I'm near the end, but have been stuck on a particular portion for a week now and could use some help.
I have a dict and a list, both with nested dicts of a large amount of data. I will provide samples:
neo_dict (these are only the first three nested dicts of about 40,000):
{
"2020 M3": {
"id": "dK20M030",
"spkid": "1003699",
"full_name": " C/2020 M3 (ATLAS)",
"pdes": "2020 M3",
"name": "ATLAS",
"prefix": "C",
"neo": "Y",
"pha": "",
"H": "",
"G": "",
"M1": "14.6",
"M2": "",
"K1": "6.5",
"K2": "",
"PC": "",
"diameter": "",
"extent": "",
"albedo": "",
"rot_per": "",
"GM": "",
"BV": "",
"UB": "",
"IR": "",
"spec_B": "",
"spec_T": "",
"H_sigma": "",
"diameter_sigma": "",
"orbit_id": "JPL 17",
"epoch": "2459177.5",
"epoch_mjd": "59177",
"epoch_cal": "20201124",
"equinox": "J2000",
"e": "0.952708189",
"a": "26.81750256",
"q": "1.26824827",
"i": "23.47352818",
"om": "71.2502355",
"w": "328.4462512",
"ma": "0.208483561",
"ad": "52.36675686",
"n": "0.007097029",
"tp": "2459148.124",
"tp_cal": "20201025.62",
"per": "50725.45171",
"per_y": "138.8787179",
"moid": "0.327377",
"moid_ld": "127.4053071",
"moid_jup": "0.886293",
"t_jup": "1.46",
"sigma_e": "2.52E-06",
"sigma_a": "0.0014408",
"sigma_q": "5.38E-07",
"sigma_i": "2.93E-05",
"sigma_om": "9.94E-06",
"sigma_w": "1.70E-05",
"sigma_ma": "1.68E-05",
"sigma_ad": "0.0028134",
"sigma_n": "5.72E-07",
"sigma_tp": "1.88E-05",
"sigma_per": "4.0879",
"class": "HTC",
"producer": "Otto Matic",
"data_arc": "265",
"first_obs": "6/27/2020",
"last_obs": "3/19/2021",
"n_obs_used": "2140",
"n_del_obs_used": "",
"n_dop_obs_used": "",
"condition_code": "3",
"rms": "0.48834",
"two_body": "",
"A1": "",
"A2": "",
"A3": "",
"DT": ""
},
"2020 P4-B": {
"id": "dK20P04b",
"spkid": "1003715",
"full_name": " C/2020 P4-B",
"pdes": "2020 P4-B",
"name": "",
"prefix": "C",
"neo": "Y",
"pha": "",
"H": "",
"G": "",
"M1": "16.2",
"M2": "",
"K1": "10",
"K2": "",
"PC": "",
"diameter": "",
"extent": "",
"albedo": "",
"rot_per": "",
"GM": "",
"BV": "",
"UB": "",
"IR": "",
"spec_B": "",
"spec_T": "",
"H_sigma": "",
"diameter_sigma": "",
"orbit_id": "JPL 1",
"epoch": "2459068.5",
"epoch_mjd": "59068",
"epoch_cal": "20200807",
"equinox": "J2000",
"e": "0.909209182",
"a": "1.020070968",
"q": "0.092613078",
"i": "28.14731293",
"om": "173.4267944",
"w": "171.6064847",
"ma": "358.9621325",
"ad": "1.947528857",
"n": "0.956661933",
"tp": "2459069.585",
"tp_cal": "20200808.08",
"per": "376.3084822",
"per_y": "1.030276474",
"moid": "0.135011",
"moid_ld": "52.54223087",
"moid_jup": "3.49537",
"t_jup": "5.426",
"sigma_e": "0.16589",
"sigma_a": "1.9184",
"sigma_q": "0.031723",
"sigma_i": "4.689",
"sigma_om": "14.935",
"sigma_w": "29.714",
"sigma_ma": "2.8976",
"sigma_ad": "3.6627",
"sigma_n": "2.6988",
"sigma_tp": "0.6924",
"sigma_per": "1061.6",
"class": "ETc",
"producer": "Davide Farnocchia",
"data_arc": "1",
"first_obs": "8/6/2020",
"last_obs": "8/7/2020",
"n_obs_used": "80",
"n_del_obs_used": "",
"n_dop_obs_used": "",
"condition_code": "9",
"rms": "0.079265",
"two_body": "",
"A1": "",
"A2": "",
"A3": "",
"DT": ""
},
"2020 P4-C": {
"id": "dK20P04c",
"spkid": "1003716",
"full_name": " C/2020 P4-C",
"pdes": "2020 P4-C",
"name": "",
"prefix": "C",
"neo": "Y",
"pha": "",
"H": "",
"G": "",
"M1": "15.6",
"M2": "",
"K1": "10",
"K2": "",
"PC": "",
"diameter": "",
"extent": "",
"albedo": "",
"rot_per": "",
"GM": "",
"BV": "",
"UB": "",
"IR": "",
"spec_B": "",
"spec_T": "",
"H_sigma": "",
"diameter_sigma": "",
"orbit_id": "JPL 1",
"epoch": "2459069.5",
"epoch_mjd": "59069",
"epoch_cal": "20200808",
"equinox": "J2000",
"e": "0.939420568",
"a": "1.390541481",
"q": "0.084238213",
"i": "37.55936253",
"om": "165.2569181",
"w": "116.4217142",
"ma": "0.699590126",
"ad": "2.69684475",
"n": "0.601074288",
"tp": "2459068.336",
"tp_cal": "20200806.84",
"per": "598.927632",
"per_y": "1.639774489",
"moid": "0.335478",
"moid_ld": "130.5579733",
"moid_jup": "3.29387",
"t_jup": "4.023",
"sigma_e": "5.0775",
"sigma_a": "113.92",
"sigma_q": "0.28571",
"sigma_i": "93.792",
"sigma_om": "70.845",
"sigma_w": "359.42",
"sigma_ma": "83.269",
"sigma_ad": "220.95",
"sigma_n": "73.867",
"sigma_tp": "4.8578",
"sigma_per": "73603",
"class": "ETc",
"producer": "Davide Farnocchia",
"data_arc": "",
"first_obs": "8/7/2020",
"last_obs": "8/7/2020",
"n_obs_used": "29",
"n_del_obs_used": "",
"n_dop_obs_used": "",
"condition_code": "9",
"rms": "0.024266",
"two_body": "",
"A1": "",
"A2": "",
"A3": "",
"DT": ""
}
}
Each nested dict uses the pdes as key.
I also have a list of nested dicts. The first three nested dicts (of about 30,000) within the list are:
cad_list
[{"des": "2003 JC17", "orbit_id": "18", "jd": "2488068.556306376", "cd": "2099-Dec-31 01:21", "dist": "0.338023789278089", "dist_min": "0.337895151250112", "dist_max": "0.338152711373013", "v_rel": "22.5355709843525", "v_inf": "22.5352212003886", "t_sigma_f": "00:58", "h": "17.8"}
{"des": "2017 WS12", "orbit_id": "7", "jd": "2488068.734163492", "cd": "2099-Dec-31 05:37", "dist": "0.141595130202053", "dist_min": "0.129742488499914", "dist_max": "0.156296976341895", "v_rel": "7.51471164318556", "v_inf": "7.51220712553632", "t_sigma_f": "3_07:42", "h": "23.2"}
{"des": "2010 XB24", "orbit_id": "19", "jd": "2488069.369087819", "cd": "2099-Dec-31 20:51", "dist": "0.126306889299689", "dist_min": "0.125428658725108", "dist_max": "0.127185695415594", "v_rel": "16.6758717193855", "v_inf": "16.6746066532063", "t_sigma_f": "01:03", "h": "21.8"}]
What I want to do is create a new dict that holds all of the information from neo_dict. Then, if the value in des for each nested dict within cad_list matches any of the keys within neo_dict, I want to add that nested dict from cad_list onto the nested dict within neo_dict.
So for instance:
neo_dict
{
"2020 M3": {
"id": "dK20M030",
"spkid": "1003699",
"full_name": " C/2020 M3 (ATLAS)",}}
cad_list
[{"des": "2020 M3", "orbit_id": "18", "jd": "2488068.556306376"}]
new_dict
{
"2020 M3": {
"id": "dK20M030",
"spkid": "1003699",
"full_name": " C/2020 M3 (ATLAS)",
"orbit_id": "18",
"jd": "2488068.556306376",
}}
I tried to make it as simple as possible. Please let me know if you have any questions.
**
UPDATE:
**
I am taking it step by step. Right now, I am trying to access the 'des' key within cad_list, but I am getting a TypeError.
>>> print(cad_list[0]['des'])
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "C:\Users\xxx", line 110, in __init__
print(cad_list[0]['des'])
TypeError: string indices must be integers
What you could do is go through the values of your cad_list
for stuff in cad_list:
then check if the "des" is in neo_dict
if stuff['des'] in neo_dict:
Then if it matches you can create a new dict from the info you get. I am not sure how you want this new dict, in a list or another dict so wasn't sure how to code that portion out.
I figured out the problem I was having. When I created cad_list by appending each dict item, I appended the dictionary incorrectly (I used json.dumps(cad_dict)). So, when I was trying to access the keys in the dict, the nested dict was actually an str. But, when using the print(cad_dict) function, it printed it out looking like a nested dict.
I ended up using type(cad_dict[0]) which told me it was an str.

Flask TypeError: string indices must be integers

I'm getting this error on my Flask project for the delete method. TypeError: string indices must be integers
How can i resolve it?
def delete(self, player_name):
    """Remove the entry for player_name from the global nba_players_json.

    Returns a message dict for the API response.
    """
    global nba_players_json
    # nba_players_json is a dict keyed by player name, not a list of dicts.
    # Iterating a dict yields its keys (strings), so the original
    # list(filter(lambda x: x['name'] != player_name, ...)) ended up indexing
    # a str with 'name' -> "TypeError: string indices must be integers".
    # Filter on the keys instead, keeping the mapping shape intact.
    nba_players_json = {
        name: stats
        for name, stats in nba_players_json.items()
        if name != player_name
    }
    return {'message': 'Item deleted'}
This is what my dictionary looks like:
"AJ Price": {
"Name": "AJ Price",
"Games Played": "26",
"MIN": "324",
"PTS": "133",
"FGM": "51",
"FGA": "137",
"FG%": "37.2",
"3PM": "15",
"3PA": "57",
"3P%": "26.3",
"FTM": "16",
"FTA": "24",
"FT%": "66.7",
"OREB": "6",
"DREB": "26",
"REB": "32",
"AST": "46",
"STL": "7",
"BLK": "0",
"TOV": "14",
"PF": "15",
"EFF": "110",
"AST/TOV": "3.29",
"STL/TOV": "0.5",
"Age": "29",
"Birth_Place": "us",
"Birthdate": "October 7, 1986",
"Collage": "University of Connecticut",
"Experience": "5",
"Height": "185",
"Pos": "PG",
"Team": "PHO",
"Weight": "81.45",
"BMI": "23.79839299"}
See below
# In-memory "database" of NBA players, keyed by player name; each value is a
# dict of that player's stats (all values stored as strings). Note the top
# level is a dict, NOT a list -- iterating it yields key strings, which is
# exactly what broke the question's filter-based delete().
nba_players_json = {"AJ Price": {
"Name": "AJ Price",
"Games Played": "26",
"MIN": "324",
"PTS": "133",
"FGM": "51",
"FGA": "137",
"FG%": "37.2",
"3PM": "15",
"3PA": "57",
"3P%": "26.3",
"FTM": "16",
"FTA": "24",
"FT%": "66.7",
"OREB": "6",
"DREB": "26",
"REB": "32",
"AST": "46",
"STL": "7",
"BLK": "0",
"TOV": "14",
"PF": "15",
"EFF": "110",
"AST/TOV": "3.29",
"STL/TOV": "0.5",
"Age": "29",
"Birth_Place": "us",
"Birthdate": "October 7, 1986",
"Collage": "University of Connecticut",
"Experience": "5",
"Height": "185",
"Pos": "PG",
"Team": "PHO",
"Weight": "81.45",
"BMI": "23.79839299"}}
def delete(player_name):
    """Remove player_name from the global nba_players_json mapping.

    Returns a message dict indicating whether the entry existed.
    """
    global nba_players_json
    # EAFP: attempt the deletion and treat a missing key as "not deleted"
    # rather than doing a separate membership test first. Behaviour and
    # messages are identical to the membership-test version.
    try:
        del nba_players_json[player_name]
    except KeyError:
        return {'message': 'Item not deleted'}
    return {'message': 'Item deleted'}
print(delete('AJ Price'))

Checking existence of two list elements in a dict based on each sentence?

I have a JSON file ...
"1": {"address": "1",
"ctag": "Ne",
"feats": "_",
"head": "6",
"lemma": "Ghani",
"rel": "SBJ",
"tag": "Ne",
"word": "Ghani"},
"2": {"address": "2",
"ctag": "AJ",
"feats": "_",
"head": "1",
"lemma": "born",
"rel": "NPOSTMOD",
"tag": "AJ",
"word": "born"},
"3": {"address": "3",
"ctag": "P",
"feats": "_",
"head": "6",
"lemma": "in",
"rel": "ADV",
"tag": "P",
"word": "in"},
"4": {"address": "4",
"ctag": "N",
"feats": "_",
"head": "3",
"lemma": "Kabul",
"rel": "POSDEP",
"tag": "N",
"word": "Kabul"},
"5": {"address": "5",
"ctag": "PUNC",
"feats": "_",
"head": "6",
"lemma": ".",
"rel": "PUNC",
"tag": "PUNC",
"word": "."},
I read the JSON file and stored in a dict.
import json
# read file
# json.load() parses the entire document in one call and returns the
# corresponding Python object -- a dict here, since the top-level JSON
# value is an object.
with open('../data/data.txt', 'r') as JSON_file:
obj = json.load(JSON_file)
# NOTE(review): dict(obj) is just a shallow copy -- json.load() already
# returned a dict, so `d = obj` (or using obj directly) would do the same.
d = dict(obj) # stored it in a dict
I extracted two list from this dict that each list contains relation from text and entities as follow:
entities(d) = ['Ghani', 'Kabul', 'Afghanistan'....]
relation(d) = ['president', 'capital', 'located'...]
Now I want to check in each sentence of dict d, if any element of entities(d) and relation(d) exist, it should be stored to another list.
What I did?
to_match = set(relation(d) + entities(d))
entities_and_relation = [[j for j in to_match if j in i]
for i in ''.join(d).split('.')[:-1]]
print(entities_and_relation)
But this returns an empty list. Can you tell me what is wrong here?
OUTPUT should be like:
[Ghani, president, Afghanistan] ...
Here I solved this problem, but I don't know how to give it a specific format for each related entities.
# Scan every token record in d (one dict per word) and print the word when
# it appears in the combined entity/relation set `to_match`.
for i in d.values():
# NOTE(review): for a token that ends in '.', split('.')[-1] yields the
# empty string (the part AFTER the last dot), not the word with the dot
# stripped -- presumably punctuation stripping was intended; verify.
if i['word'].split('.')[-1] in to_match:
# Centre each matched word in a 10-character field.
print('{: ^10}'.format(i['word']))
Output:
Ghani
Kabul
Born
Kabul
Captial
Afghanistan
My expected output:
(Ghani, born, Kabul), (Kabul, capital, Afghanistan) or ...
Born_in(Ghani, Kabul), Capital_of(Kabul, Afghanistan)
I don't know to map it or designed it to give me as expected output.

CSV files and dictionary

I'm trying to create CSV file, based on iterating over other CSV files. The output CSV file, is in a slightly different format to the input ones.
My question is, being quite new to Python, how I'd go about doing this?
My input is something like this:
1.csv
"Street", "Number", "Occupants"
"Test Road", "7", "4"
"Test Street", "5", "1"
"Test Avenue", "2", "6"
2.csv
"Street", "Number", "Occupants"
"Test Road", "12", "2"
"Test Street", "11", "3"
"Test Avenue", "9", "2"
3.csv
"Street", "Number", "Occupants"
"Test Road", "34", "2"
"Test Street", "22", "3"
"Test Lane", "19", "2"
expected_output.csv
"", "Street", "1", "2", "3"
"Number", "Test Road", "7", "12", "34"
"", "Test Street", "5", "11", "22"
"", "Test Avenue", "2", "9", "N/A"
"", "Test Lane", "N/A", "N/A", "19"
"Occupants", "Test Road", "4", "2", "2"
"", "Test Street", "1", "3", "3"
"", "Test Avenue", "6", "2", "N/A"
"", "Test Lane", "N/A", "N/A", "2"
So you can see across the top of the output i have Number/Occupants, Street and then the number of the input cvs file (1.csv, 2.csv, 3.csv etc)
If a particular street is not in the input file, then the output should display N/A for the Number or Occupant
I'm not sure what the best/fastest approach to this is. Either concatenating all the CSV files together first, maybe in a dictionary, then doing some complex loop or have several loops to create the output.
Based on our discussion in chat, here's a full program including some test files. You will only need to delete the labeled lines to use your own files.
# Merge N input CSV files ("Street", "Number", "Occupants") into one pivoted
# output CSV with a Number section and an Occupants section, one column per
# input file, 'N/A' where a street is missing from a file.

# User-configurable variables:
number_of_files = 4

# Delete the following lines to use your own files.
f1 = open('1.csv','w')
f1.write('''"Street", "Number", "Occupants"
"Test Road", "7", "4"
"Test Street", "5", "1"
"Test Avenue", "2", "6"''')
f1.close()
f2 = open('2.csv','w')
f2.write('''"Street", "Number", "Occupants"
"Test Road", "12", "2"
"Test Street", "11", "3"
"Test Avenue", "9", "2"''')
f2.close()
f3 = open('3.csv','w')
f3.write('''"Street", "Number", "Occupants"
"Test Road", "34", "2"
"Test Street", "22", "3"
"Test Lane", "19", "2"''')
f3.close()
f4 = open('4.csv','w')
f4.write('''"Street", "Number", "Occupants"
"Test Road", "4", "7"
"Test Street", "1243", "6"
"Test Lane", "17", "1"''')
f4.close()
# Stop deleting.

# Parse every input file into a list of rows. The original fabricated
# variables f1..fN with exec(); a plain loop over the filenames does the
# same job without dynamic code execution, and `with` closes each handle.
file_set = []
for y in range(number_of_files):
    with open(str(y + 1) + '.csv', 'r') as fh:
        text = fh.read().replace('"', '').replace(', ', ',')
    file_set.append([row.split(',') for row in text.split('\n')])
num_files = len(file_set)

# data_dict maps street name -> one [number, occupants] pair per input file,
# pre-filled with 'N/A' so streets missing from a file still line up.
data_dict = {}
for file_index, file_rows in enumerate(file_set):
    # enumerate() replaces file_set.index(file)/file.index(line), which
    # return the FIRST matching element and silently misplace data when two
    # files (or two rows) are identical.
    for line in file_rows:
        if line[0] not in data_dict:
            # Build independent inner lists: num_files*[['N/A','N/A']] would
            # alias a single list num_files times.
            data_dict[line[0]] = [['N/A', 'N/A'] for _ in range(num_files)]
        data_dict[line[0]][file_index] = line[1:]
print (data_dict)

def _write_section(out, label, column):
    """Write one output section, with `label` only on its first row.

    `column` selects 0 (Number) or 1 (Occupants) from each street's pairs.
    """
    first = True
    for key in data_dict:
        if key == 'Street':
            continue  # skip the header row captured from the input files
        out.write((label if first else '') + ',' + key)
        first = False
        for x in range(num_files):
            out.write(',' + data_dict[key][x][column])
        out.write('\n')

f0 = open('output.csv', 'w')
f0.write(',Street')
for x in range(num_files):
    f0.write(',' + str(x + 1))
f0.write('\n')
# The two hand-duplicated output loops collapse into two helper calls.
_write_section(f0, 'Number', 0)
_write_section(f0, 'Occupants', 1)
f0.close()
Enjoy, and have a great day.

Best way to speed up the process of creating dictionaries when dealing with a large amount of netflow data in a .json file in Python?

I'm currently working on sorting netflow data in a json file based on end time. I'm placing all of this data into dictionaries in which keys are the end time (but only the hour and minute, so that multiple data values fall under one time). However, this is taking a bit long - not longer than a few seconds, but that's still too long. What's a good way to improve the big O of this? What I'm doing right now is just going through the file line by line, and extracting the end times, and creating an empty dictionary (where the values are empty sets) and the keys are the hour/min of the end time. Then, I just go through the dictionary and add the lines that have the corresponding endtime to the given key to the value which is a set.
edit: Here is a sample of the kind of json data. The following is one line of it. The files I'm working with are close to 300,000 lines.
{
"#timestamp": "2015-05-18T19:26:08.000Z",
"netflow": {
"version": "9",
"flow_seq_num": "188185",
"flowset_id": "257",
"last_switched": "2015-05-15T14:28:02.999Z",
"first_switched": "2015-05-15T14:27:38.999Z",
"in_bytes": "71",
"in_pkts": "1",
"input_snmp": "5",
"output_snmp": "4",
"ipv4_src_addr": "192.1.44.133",
"ipv4_dst_addr": "10.10.1.4",
"protocol": "6",
"src_tos": "0",
"dst_tos": "2",
"l4_src_port": "12373",
"l4_dst_port": "80",
"flow_sampler_id": "0",
"ipv4_next_hop": "10.10.1.5",
"dst_mask": "2",
"src_mask": "31",
"tcp_flags": "6",
"direction": "0"
},
"#version": "1",
"host": "192.168.19.202",
"src_host_name": "",
"dst_host_name": "",
"app_name": "",
"tcp_flags_str": "",
"dscp": "",
"highval": "",
"src_blacklisted": "0",
"dst_blacklisted": "0",
"invalid_ToS": "0",
"bytes_per_packet": 71,
"tcp_nominal_payload": "0",
"malformed_ip": "0",
"empty_tcp": "0",
"short_tcp_handshake": "0",
"icmp_malformed_packets": "0",
"snort_attack_flow": "0",
"empty_udp": "0",
"short_udp": "0",
"short_tcp_rstack": "0",
"short_tcp_pansf": "0",
"short_tcp_synack": "0",
"short_tcp_synrst": "0",
"short_tcp_finack": "0",
"short_tcp_pna": "0",
"non_unicast_src": "0",
"multicast": "0",
"broadcast": "0",
"network": "0",
"tcp_urg": "0",
"land_attack": "0",
"short_tcp_ack": "0",
"tcp_synfin": "0",
"tcp_fin": "0",
"malformed_tcp": "1",
"tcp_xmas": "0",
"udp_echo_req": "0",
"tcp_null": "0",
"tcp_syn": "0",
"malformed_udp": "0",
"tcp_rst": "0",
"icmp_request": "0",
"icmp_response": "0",
"icmp_port_unreachable": "0",
"icmp_host_unreachable": "0",
"icmp_unreachable_for_Tos": "0",
"icmp_network_unreachable": "0",
"icmp_redirects": "0",
"icmp_time_exceeded_flows": "0",
"icmp_parameter_problem_flows": "0",
"icmp_trace_route": "0",
"icmp_datagram": "0",
"udp_echo_chargen_broadcast": "0",
"udp_chargen_echo_broadcast": "0",
"icmp_src_quench": "0",
"icmp_proto_unreachable": "0",
"udp_echo_broadcast": "0",
"udp_echo_rsp": "0"
}
As for code I have tried, currently I'm just converting these lines into dictionaries to access the different values I'm looking to sort by. It's really simple, I'm just using json.loads and such to create dictionaries. What kind of data structure is best for organizing this kind of thing? I'm using a dictionary for now, but is there a better one?

Categories

Resources