I have attached a link to the JSON data for download:
json data
Currently I have written the following function to collect each level of children data into a combined dataframe:
import pandas as pd
from pandas.io.json import json_normalize

def get_children(catMapping):
    level4 = json_normalize(catMapping['SuccessResponse']['Body'],
                            ['children', 'children', 'children', 'children', ['children']])
    level3 = json_normalize(catMapping['SuccessResponse']['Body'],
                            ['children', 'children', 'children', ['children']])
    level2 = json_normalize(catMapping['SuccessResponse']['Body'],
                            ['children', 'children', ['children']])
    level1 = json_normalize(catMapping['SuccessResponse']['Body'],
                            ['children', ['children']])
    level0 = json_normalize(catMapping['SuccessResponse']['Body'],
                            ['children'])
    combined = pd.concat([level0, level1, level2, level3, level4])
    combined = combined.reset_index(drop=True)
    return combined
It looks like this is not the recommended way, but I am unable to write a function that can traverse each level.
Can you please help me with a better function?
Here is a function that recursively iterates over all items:
import json
import pandas as pd

# Load the JSON file (json.load handles the true/false/null literals):
with open(r"data.json", "r") as f:
    data = json.load(f)

def nest_iter(items):
    # Yield each item with its children replaced by their ids,
    # then recurse into the children themselves.
    for item in items:
        children_ids = [o["categoryId"] for o in item["children"]]
        ret_item = item.copy()
        ret_item["children"] = children_ids
        yield ret_item
        yield from nest_iter(item["children"])

df = pd.DataFrame(nest_iter(data['SuccessResponse']['Body']))
The result:
categoryId children leaf name var
....
4970 10001244 [] True Business False
4971 10001245 [] True Casual False
4972 10001246 [] True Fashion False
4973 10001247 [] True Sports False
4974 7756 [7761, 7758, 7757, 7759, 7760] False Women False
4975 7761 [] True Accessories False
4976 7758 [] True Business False
4977 7757 [] True Casual False
4978 7759 [] True Fashion False
4979 7760 [] True Sports False
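If you also want to know how deep each category sits in the tree, a small variation of the same generator can pass the nesting level down with each recursive call. This is only a sketch, assuming the same categoryId/children keys as above:
import json
import pandas as pd

def nest_iter_with_level(items, level=0):
    # Same traversal as above, but each yielded row also records its depth.
    for item in items:
        ret_item = item.copy()
        ret_item["children"] = [o["categoryId"] for o in item["children"]]
        ret_item["level"] = level
        yield ret_item
        yield from nest_iter_with_level(item["children"], level + 1)

with open("data.json", "r") as f:
    data = json.load(f)

df = pd.DataFrame(nest_iter_with_level(data['SuccessResponse']['Body']))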
I'm working with a stored procedure to which I pass a start and end date, and it returns data. I'm passing it ten different dates and making ten calls to it, see below:
match1 = sp_data(startDate = listOfDates[0], endDate=listOfDates[0])
match2 = sp_data(startDate = listOfDates[1], endDate=listOfDates[1])
match3 = sp_data(startDate = listOfDates[2], endDate=listOfDates[2])
match4 = sp_data(startDate = listOfDates[3], endDate=listOfDates[3])
match5 = sp_data(startDate = listOfDates[4], endDate=listOfDates[4])
match6 = sp_data(startDate = listOfDates[5], endDate=listOfDates[5])
match7 = sp_data(startDate = listOfDates[6], endDate=listOfDates[6])
match8 = sp_data(startDate = listOfDates[7], endDate=listOfDates[7])
match9 = sp_data(startDate = listOfDates[8], endDate=listOfDates[8])
match10 = sp_data(startDate = listOfDates[9], endDate=listOfDates[9])
See listOfDates pandas series below:
print(listOfDates)
0 20220524
1 20220613
2 20220705
3 20220713
4 20220720
5 20220805
6 20220903
7 20220907
8 20220928
9 20221024
Name: TradeDate, dtype: object
Is there a better and more efficient way of doing this? Potentially in a loop of some kind?
Any help greatly appreciated, thanks!
You could use a list comprehension to make a list of matches:
matches = [sp_data(startDate=trade_date, endDate=trade_date) for trade_date in listOfDates]
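If each sp_data call returns a DataFrame (an assumption based on the match1 ... match10 naming), the list built above can also be stacked into a single frame, keeping the trade date as an index level:
import pandas as pd

# matches is the list built by the comprehension above
all_matches = pd.concat(matches, keys=list(listOfDates))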
I have these two functions, and when I run them my kernel dies very quickly. What can I do to prevent it? It happens after appending about 10 files to the dataframe. Unfortunately the JSON files are big (approx. 150 MB each, and I have dozens of them) and I have no idea how to join them together.
import os
import pandas as pd
from pandas.io.json import json_normalize
import json

def filtering_nodes(df):
    id_list = df.index.tolist()
    print("Dropping rows without 4 nodes and 3 members...")
    for x in id_list:
        if len(df['Nodes'][x]) != 4 and len(df['Members'][x]) != 3:
            df = df.drop(x)
    print("Converting to csv...")
    df.to_csv("whole_df.csv", sep='\t')
    return df

def merge_JsonFiles(filename):
    result = list()
    cnt = 0
    df_all = None
    data_all = None
    for f1 in filename:
        print("Appending file: ", f1)
        with open(os.path.join('../../data', f1), 'r') as infile:
            data_all = json.loads(infile.read())
        if cnt == 0:
            df_all = pd.json_normalize(data_all, record_path=['List2D'], max_level=2, sep="-")
        else:
            df_all = df_all.append(pd.json_normalize(data_all, record_path=['List2D'], max_level=2, sep="-"), ignore_index=True)
        cnt += 1
    return df_all

files = os.listdir('../../data')
df_all_test = merge_JsonFiles(files)
df_all_test_drop = filtering_nodes(df_all_test)
EDIT:
Following @jlandercy's answer, I've made this:
import pathlib

def merging_to_csv():
    for path in pathlib.Path("../../data/loads_data/Dane/hilti/").glob("*.json"):
        # Open source file one by one:
        with path.open() as handler:
            df = pd.json_normalize(json.load(handler), record_path=['List2D'])
        # Identify rows to drop (boolean indexing):
        q = (df["Nodes"] != 4) & (df["Members"] != 3)
        # Inplace drop (no extra copy in RAM):
        df.drop(q, inplace=True)
        # Append data to disk instead of RAM:
        df.to_csv("output.csv", mode="a", header=False)

merging_to_csv()
and I get this error:
KeyError Traceback (most recent call last)
<ipython-input-55-cf18265ca50e> in <module>
----> 1 merging_to_csv()
<ipython-input-54-698c67461b34> in merging_to_csv()
51 q = (df["Nodes"] != 4) & (df["Members"] != 3)
52 # Inplace drop (no extra copy in RAM):
---> 53 df.drop(q, inplace=True)
54 # Append data to disk instead of RAM:
55 df.to_csv("output.csv", mode="a", header=False)
/opt/conda/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4906 level=level,
4907 inplace=inplace,
-> 4908 errors=errors,
4909 )
4910
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors)
4148 for axis, labels in axes.items():
4149 if labels is not None:
-> 4150 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4151
4152 if inplace:
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors)
4183 new_axis = axis.drop(labels, level=level, errors=errors)
4184 else:
-> 4185 new_axis = axis.drop(labels, errors=errors)
4186 result = self.reindex(**{axis_name: new_axis})
4187
/opt/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors)
6016 if mask.any():
6017 if errors != "ignore":
-> 6018 raise KeyError(f"{labels[mask]} not found in axis")
6019 indexer = indexer[~mask]
6020 return self.delete(indexer)
KeyError: '[ True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True True True True True True True True True True True True\n True] not found in axis'
What's wrong? I've uploaded the two smallest JSON files here:
https://drive.google.com/drive/folders/1xlC-kK6NLGr0isdy1Ln2tzGmel45GtPC?usp=sharing
You are facing multiple issues in your original approach:
Multiple copies of the dataframe: df = df.drop(...);
The whole dataset stored in RAM because of append;
An unnecessary for loop to filter rows; use boolean indexing instead.
Here is a baseline snippet to solve your problem, based on the data sample you provided:
import json
import pathlib
import pandas as pd

# Iterate source files:
for path in pathlib.Path(".").glob("result*.json"):
    # Open source file one by one:
    with path.open() as handler:
        # Normalize JSON model:
        df = pd.json_normalize(json.load(handler), record_path=['List2D'], max_level=2, sep="-")
    # Apply len to list fields to identify rows to drop (boolean indexing):
    q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)
    # Filter and append data to disk instead of RAM:
    df.loc[~q, :].to_csv("output.csv", mode="a", header=False)
It loads files into RAM one at a time, then appends the filtered rows to disk rather than accumulating them in RAM. These fixes will drastically reduce RAM usage, which should peak at roughly twice the size of the biggest JSON file.
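Regarding the KeyError in your edit: DataFrame.drop expects index labels, not a boolean mask, and the Nodes / Members columns hold lists, so their lengths have to be compared rather than the columns themselves. A minimal sketch of the two working options, on hypothetical data:
import pandas as pd

# Hypothetical data with list-valued columns:
df = pd.DataFrame({"Nodes": [[1, 2, 3, 4], [1, 2]], "Members": [[1, 2, 3], [1]]})

# Mask of rows to drop (len() because the columns hold lists):
q = (df["Nodes"].apply(len) != 4) & (df["Members"].apply(len) != 3)

# Option 1: keep the complement with boolean indexing (what the snippet above does):
filtered = df.loc[~q]

# Option 2: if you prefer drop(), pass the matching index labels instead of the mask:
df.drop(index=df.index[q], inplace=True)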
I am trying to create new columns based on conditions on other columns.
(The data frame is already aggregated by user.)
This is a sample of the data frame:
event_names country
["deleteobject", "getobject"] ["us"]
["getobject"] ["ca"]
["deleteobject", "putobject"] ["ch"]
I want to create 3 new columns:
was data deleted?
was data downloaded?
did the events come from my whitelisted countries?
WHITELISTED_COUNTRIES = ["us", "sg"]
like this:
event_names country was_data_deleted? was_data_downloaded? whitelisted_country?
["deleteobject","getobject"] ["us"] True True True
["getobject"] ["ca"] False True False
["deleteobject","putobject"] ["ch"] True False False
This is what I tried so far:
result_df['was_data_deleted'] = result_df['event_name'].apply(lambda x:True if any("delete" in x for i in x) else False)
result_df['was_data_downloaded'] = result_df['event_name'].apply(lambda x:True if "getObject" in i for i in x else False)
result_df['strange_countries'] = result_df['country'].apply(lambda x:False if any(x in WHITELISTED_COUNTRIES for x in result_df['country']) else False)
I get an error: "SyntaxError: invalid syntax".
Any ideas? Thanks!
df['was_data_deleted'] = df['event_names'].apply(lambda x: 'deleteobject' in x)
df['was_data_downloaded'] = df['event_names'].apply(lambda x: 'getobject' in x)
df['whitelisted_country'] = df['country'].apply(lambda x: x[0] in WHITELISTED_COUNTRIES)
print(df)
Prints:
event_names country was_data_deleted was_data_downloaded whitelisted_country
0 [deleteobject, getobject] [us] True True True
1 [getobject] [ca] False True False
2 [deleteobject, putobject] [ch] True False False
You can simplify your lambda functions by removing the if-else and the explicit True/False, because the comparisons already return booleans:
WHITELISTED_COUNTRIES = ["us", "sg"]
# check each event for the substring "delete"
f1 = lambda x: any("delete" in i for i in x)
result_df['was_data_deleted'] = result_df['event_names'].apply(f1)

# check for the string "getobject"
f2 = lambda x: "getobject" in x
result_df['was_data_downloaded'] = result_df['event_names'].apply(f2)

# check the country list against the whitelist
f3 = lambda x: any(y in WHITELISTED_COUNTRIES for y in x)
result_df['strange_countries'] = result_df['country'].apply(f3)

print(result_df)
event_names country was_data_deleted was_data_downloaded \
0 [deleteobject, getobject] [us] True True
1 [getobject] [ca] False True
2 [deleteobject, putobject] [ch] True False
strange_countries
0 True
1 False
2 False
I have data in the following format:
1_engineer_grade1 |Boolean IsMale IsNorthAmerican IsFromUSA |Name blah
2_lawyer_grade7 |Boolean IsFemale IsAlive |Children 2
I need to convert this into a dataframe with the following columns:
id job grade Bool.IsMale Bool.IsFemale Bool.IsAlive Bool.IsNorthAmerican Bool.IsFromUSA Name Children
1 engineer 1 True False False True True blah NaN
2 lawyer 7 False True True False False NaN 2
I could preprocess this data in python and then call pd.DataFrame on this, but I was wondering if there was a better way of doing this?
UPDATE: I ended up doing the following. If there are obvious optimizations, please let me know:
import pandas as pd

with open(vwfile, encoding='latin-1') as f:
    data = []
    for line in f:
        line = [x.strip() for x in line.strip().split('|')]
        # line == [
        #     "1_engineer_grade1",
        #     "Boolean IsMale IsNorthAmerican IsFromUSA",
        #     "Name blah"
        # ]
        ident, job, grade = line[0].split("_")
        features = line[1:]
        bools = {
            "IsMale": False,
            "IsFemale": False,
            "IsNorthAmerican": False,
            "IsFromUSA": False,
            "IsAlive": False,
        }
        others = {}
        for category in features:
            if category.startswith("Boolean "):
                for feature in category.split(' ')[1:]:
                    bools[feature] = True
            else:
                feature = category.split(" ")
                # feature == ["Name", "blah"]
                others[feature[0]] = feature[1]
        featuredict = {
            'ident': ident,
            'job': job,
            'grade': grade,
        }
        featuredict.update(bools)
        featuredict.update(others)
        data.append(featuredict)

df = pd.DataFrame(data)
UPDATE 2: A million-line file took about 55 seconds to process with this.
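If you want to push more of the work into pandas itself, the Boolean part can be vectorised with str.extract and str.get_dummies. This is only a sketch on the two sample lines (the remaining "Key value" groups such as Name and Children would still need the loop or another extract), so treat it as an idea rather than a drop-in replacement:
import pandas as pd

lines = pd.Series([
    "1_engineer_grade1 |Boolean IsMale IsNorthAmerican IsFromUSA |Name blah",
    "2_lawyer_grade7 |Boolean IsFemale IsAlive |Children 2",
])

# Split the leading id_job_gradeN token into three columns:
ids = lines.str.extract(r"^(?P<ident>\d+)_(?P<job>[^_]+)_grade(?P<grade>\d+)")

# Turn the space-separated Boolean flags into one indicator column per flag:
bools = (lines.str.extract(r"\|Boolean ([^|]*)")[0]
              .str.strip()
              .str.get_dummies(sep=" ")
              .astype(bool))

df = pd.concat([ids, bools], axis=1)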
I have used the Python NLTK library and its Naive Bayes classifier to detect whether a string should be tagged "php" or not, based on training data (Stack Overflow questions, in fact).
The classifier seems to find interesting features:
Most Informative Features
contains-word-isset = True True : False = 125.6 : 1.0
contains-word-echo = True True : False = 28.1 : 1.0
contains-word-php = True True : False = 17.1 : 1.0
contains-word-this- = True True : False = 16.0 : 1.0
contains-word-mysql = True True : False = 14.3 : 1.0
contains-word-_get = True True : False = 11.7 : 1.0
contains-word-foreach = True True : False = 7.6 : 1.0
Features are defined as follows:
def features(question):
    features = {}
    for token in detectorTokens:
        featureName = "contains-word-" + token
        features[featureName] = (token in question)
    return features
But it seems the classifier has decided to never tag a string as a "php" question.
Even a simple string like "is this a php question?" is classified as False.
Can anyone help me understand this phenomenon?
Here is some partial code (I have 3 or 4 pages of code, so this is just a small part):
classifier = nltk.NaiveBayesClassifier.train(train_set)
cross_valid_accuracy = nltk.classify.accuracy(classifier, cross_valid_set)

refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(cross_valid_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print('Precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
print('Recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
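Since it is a Naive Bayes model, one thing worth checking is the label distribution of the training set: if non-"php" examples heavily outnumber "php" ones, the class prior can dominate the per-word likelihoods and push every prediction to the majority class. A small diagnostic sketch, assuming train_set is a list of (features, label) pairs and that the labels are the True/False values shown in the informative-features output:
from collections import Counter

# How balanced are the classes the classifier was trained on?
print(Counter(label for _, label in train_set))

# How much probability mass does the classifier give each class
# for the example question?
dist = classifier.prob_classify(features("is this a php question?"))
print(dist.prob(True), dist.prob(False))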