How to pass list-like using .reindex as doing it in .loc has been deprecated? - python

I have a dataframe with multiple fields, and I want to use some of its column values to build a new dataframe that I can write out as JSON objects:
Street   City         State   Zip_Code
24 St.   Kansas City  KS      12345-213
...      ...          ...     ...
In order to do so, I was using .loc and .apply like this in Python:
def address_x(vals):
    val = {
        'street': None if not str(vals[0]) else vals[0],
        'city': None if not str(vals[1]) else vals[1],
        'state': None if not str(vals[2]) else state(vals[2]),
        'postal_code': postal_code(str(vals[3]))
    }
    return val

def transform(dataset):
    df = pd.DataFrame()
    df['address'] = dataset.loc[['Street', 'City', 'State', 'Zip_Code']].apply(address_x, axis=1)
    return df
obj = s3client.get_object(Bucket=bucket, Key=key)
new_df = transform(pd.read_csv(io.BytesIO(obj['Body'].read()), delimiter='|', sep='|'))
new_df.to_json('TEST.json', orient='records', lines=True)
That gives me this error message: KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'
I am now trying df['address'] = dataset.reindex(['STREET', 'CITY', 'STATE', 'ZIP CODE']).apply(lambda x: address_x(x)), but it just stores all values as null instead of producing this:
{"address":{
"street": "24 St.",
"city": "Kansas City",
"state": "Kansas",
"postal_code": 12345-213}
}
The input is a regular CSV file that uses '|' as the separator between all of its columns; the four shown above are just a sample of them.
I then store the result as JSON, and currently the output looks like {"address":{"street":null,"city":null,"state":null,"postal_code":null}} for each record, instead of being populated with the CSV values.

Change to:
def address_x(vals):
    val = {
        'street': None if not str(vals['Street']) else vals['Street'],
        'city': None if not str(vals['City']) else vals['City'],
        'state': None if not str(vals['State']) else state(vals['State']),
        'postal_code': postal_code(str(vals['Zip_Code']))
    }
    return val
df['address'] = dataset[['Street', 'City', 'State', 'Zip_Code']].apply(address_x, axis=1)
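If you want to keep .reindex as the title asks, two things matter: the labels have to match the real column names exactly (reindexing on 'STREET', 'CITY', ... when the columns are 'Street', 'City', ... produces all-NaN columns), and they have to be passed on the column axis, otherwise they are treated as row labels and every value comes back as NaN, which is why the JSON came out as nulls. A minimal sketch, assuming the CSV headers are exactly Street, City, State and Zip_Code:
df['address'] = (
    dataset.reindex(columns=['Street', 'City', 'State', 'Zip_Code'])  # select on columns, not rows
           .apply(address_x, axis=1)
)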

Related

Flattening multi nested json into a pandas dataframe

I'm trying to flatten this json response into a pandas dataframe to export to csv.
It looks like this:
j = [
    {
        "id": 401281949,
        "teams": [
            {
                "school": "Louisiana Tech",
                "conference": "Conference USA",
                "homeAway": "away",
                "points": 34,
                "stats": [
                    {"category": "rushingTDs", "stat": "1"},
                    {"category": "puntReturnYards", "stat": "24"},
                    {"category": "puntReturnTDs", "stat": "0"},
                    {"category": "puntReturns", "stat": "3"},
                ],
            }
        ],
    }
]
...Many more items in the stats area.
If I run this and flatten to the teams level:
multiple_level_data = pd.json_normalize(j, record_path =['teams'])
I get:
school conference homeAway points stats
0 Louisiana Tech Conference USA away 34 [{'category': 'rushingTDs', 'stat': '1'}, {'ca...
How do I flatten it twice so that all of the stats are on their own column in each row?
If I do this:
multiple_level_data = pd.json_normalize(j, record_path =['teams'])
multiple_level_data = multiple_level_data.explode('stats').reset_index(drop=True)
multiple_level_data=multiple_level_data.join(pd.json_normalize(multiple_level_data.pop('stats')))
I end up with multiple rows instead of more columns:
You can try:
df = pd.DataFrame(j).explode("teams")
df = pd.concat([df, df.pop("teams").apply(pd.Series)], axis=1)
df["stats"] = df["stats"].apply(lambda x: {d["category"]: d["stat"] for d in x})
df = pd.concat(
    [
        df,
        df.pop("stats").apply(pd.Series),
    ],
    axis=1,
)
print(df)
Prints:
id school conference homeAway points rushingTDs puntReturnYards puntReturnTDs puntReturns
0 401281949 Louisiana Tech Conference USA away 34 1 24 0 3
You can also try this:
multiple_level_data = pd.json_normalize(j, record_path =['teams'])
multiple_level_data = multiple_level_data.explode('stats').reset_index(drop=True)
multiple_level_data=multiple_level_data.join(pd.json_normalize(multiple_level_data.pop('stats')))
#convert rows to columns.
multiple_level_data=multiple_level_data.set_index(multiple_level_data.columns[0:4].to_list())
dfx=multiple_level_data.pivot_table(values='stat',columns='category',aggfunc=list).apply(pd.Series.explode).reset_index(drop=True)
multiple_level_data=multiple_level_data.reset_index().drop(['stat','category'],axis=1).drop_duplicates().reset_index(drop=True)
multiple_level_data=multiple_level_data.join(dfx)
Output:
   school          conference      homeAway  points  puntReturnTDs  puntReturnYards  puntReturns  rushingTDs
0  Louisiana Tech  Conference USA  away      34      0              24               3            1
Instead of calling explode() on the output of json_normalize(), you can explicitly pass the paths to the metadata for each column in a single json_normalize() call. For example, ['teams', 'school'] would be one path, ['teams', 'conference'] another, etc. This creates a long dataframe similar to what you already have.
Then you can call pivot() to reshape this output into the correct shape.
# normalize json
df = pd.json_normalize(
    j, record_path=['teams', 'stats'],
    meta=['id', *(['teams', c] for c in ('school', 'conference', 'homeAway', 'points'))]
)
# column names contain a 'teams' prefix; remove it
df.columns = [c.split('.')[1] if '.' in c else c for c in df]
# pivot the intermediate result
df = (
    df.astype({'points': int, 'id': int})
    .pivot(['id', 'school', 'conference', 'homeAway', 'points'], 'category', 'stat')
    .reset_index()
)
# remove index name
df.columns.name = None
df
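Note that newer pandas releases warn about (and eventually disallow) positional arguments to DataFrame.pivot, so spelling out the keywords keeps that last step portable; the same call, sketched with explicit keywords:
df = (
    df.astype({'points': int, 'id': int})
    .pivot(index=['id', 'school', 'conference', 'homeAway', 'points'],
           columns='category', values='stat')
    .reset_index()
)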

How can I refactor my code to return a collection of dictionaries?

def read_data(service_client):
    data = list_data(domain, realm)  # This returns a data frame
    building_data = []
    building_names = {}
    all_buildings = {}
    for elem in data.iterrows():
        building = elem[1]['building_name']
        region_id = elem[1]['region_id']
        bandwith = elem[1]['bandwith']
        building_id = elem[1]['building_id']
        return {
            'Building': building,
            'Region Id': region_id,
            'Bandwith': bandwith,
            'Building Id': building_id,
        }
With this example I am only able to return a single dictionary from one iteration; I have tried printing it as well as other approaches.
I am trying to find a way to collect the dictionary from each iteration and return them all, instead of just returning one. Does anyone know a way to achieve this?
You may replace your for-loop with the following to get all dictionaries in a list.
naming = {
    'building_name': 'Building',
    'region_id': 'Region Id',
    'bandwith': 'Bandwith',
    'building_id': 'Building Id',
}
return [
    row[list(naming.values())].to_dict()
    for idx, row in data.rename(naming, axis=1).iterrows()
]
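If you do not need to iterate at all, the same list of dictionaries can be built in one step with to_dict(orient='records'); a sketch, under the same assumption that list_data returns a DataFrame with those four columns:
def read_data(service_client):
    data = list_data(domain, realm)  # returns a data frame
    naming = {
        'building_name': 'Building',
        'region_id': 'Region Id',
        'bandwith': 'Bandwith',
        'building_id': 'Building Id',
    }
    # rename the columns, keep only the mapped ones, and emit one dict per row
    return data.rename(columns=naming)[list(naming.values())].to_dict(orient='records')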

How to transform JSON SList to pandas dataframe?

a = ['{"type": "book",',
'"title": "sometitle",',
'"author": [{"name": "somename"}],',
'"year": "2000",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '',
'{"type": "book",', '
'"title": "sometitle2",',
'"author": [{"name": "somename2"}],',
'"year": "2001",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '']
I have this convoluted SList and I would like to ultimately get it into a tidy pandas dataframe.
I have tried a number of things, for example:
i = iter(a)
b = dict(zip(i, i))
Unfortunately, this creates a dictionary that looks even worse:
{'{"type": "book",':
...
Where I had an SList of dictionaries, I now have a dictionary of dictionaries.
I also tried
pd.json_normalize(a)
but this throws an error message AttributeError: 'str' object has no attribute 'values'
I also tried
r = json.dumps(a.l)
loaded_r = json.loads(r)
print(loaded_r)
but this yields a list
['{"type": "book",',
...
Again, in the end I'd like to have a pandas dataframe like this
type title author year ...
book sometitle somename 2000 ...
book sometitle2 somename2 2001
Obviously, I haven't really gotten to the point where I can feed the data to a pandas function. Every time I tried, the functions screamed at me...
a = ['{"type": "book",',
'"title": "sometitle",',
'"author": [{"name": "somename"}],',
'"year": "2000",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '',
'{"type": "book",',
'"title": "sometitle2",',
'"author": [{"name": "somename2"}],',
'"year": "2001",',
'"identifier": [{"type": "ISBN", "id": "1234567890"}],',
'"publisher": "somepublisher"}', '']
b = "[%s]" % ''.join([',' if i == '' else i for i in a ]).strip(',')
data = json.loads(b)
df = pd.DataFrame(data)
print(df)
type title author year \
0 book sometitle [{'name': 'somename'}] 2000
1 book sometitle2 [{'name': 'somename2'}] 2001
identifier publisher
0 [{'type': 'ISBN', 'id': '1234567890'}] somepublisher
1 [{'type': 'ISBN', 'id': '1234567890'}] somepublisher
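The author and identifier columns in that output are still lists of dicts. To get the flat columns shown in the desired result, one option is to pull the first element out of each list with the .str accessor; a sketch, assuming each record has exactly one author and one identifier:
df['author'] = df['author'].str[0].str['name']        # [{'name': 'somename'}] -> 'somename'
df['identifier'] = df['identifier'].str[0].str['id']  # keep just the ISBN id
print(df[['type', 'title', 'author', 'year']])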

How to write to excel sheet only those rows which match the condition using Python pandas

I have a data frame which contains 3 columns (Issue id, Creator, Versions). I need to extract the rows whose "versions" column does not contain the value "<JIRA Version" (the third and fifth rows in my case; similarly there could be multiple such rows in the data frame).
Below is the code I'm trying, but it actually prints all the rows from the data frame. Any help/suggestions are appreciated.
allissues = []
for i in issues:
    d = {
        'Issue id': i.id,
        'creator': i.fields.creator,
        'resolution': i.fields.resolution,
        'status.name': i.fields.status.name,
        'versions': i.fields.versions,
    }
    allissues.append(d)

df = pd.DataFrame(allissues, columns=['Issue id', 'creator', 'versions'])
matchers = ['<JIRA Version']
for ind in df.values:
    if matchers not in df.values:
        print(df['versions'][ind], df['Issue id'][ind])
Some minor changes in your code:
allissues = []
for i in issues:
    d = {
        'Issue id': i.id,
        'creator': i.fields.creator,
        'resolution': i.fields.resolution,
        'status.name': i.fields.status.name,
        'versions': i.fields.versions,
    }
    allissues.append(d)

df = pd.DataFrame(allissues, columns=['Issue id', 'creator', 'versions'])
matchers = '<JIRA Version'
for ind, row in df.iterrows():
    if matchers not in row.versions:
        print(row['versions'], row['Issue id'])
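Since the title asks about writing only the matching rows to an Excel sheet, a boolean mask avoids the Python loop entirely and can feed to_excel directly; a sketch, assuming the version objects stringify to something containing '<JIRA Version' (which is what the printed repr suggests), with a hypothetical output filename:
mask = ~df['versions'].astype(str).str.contains('<JIRA Version', regex=False)  # rows without that value
df[mask].to_excel('filtered_issues.xlsx', index=False)  # requires openpyxl for .xlsx output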

Read Dictionary and write to a text file

I have a dictionary now:
data = [{'position': 1, 'name':'player1:', 'number': 524}, {'position':2, 'name': 'player2:','number': 333}]
(just list two group of number first to simplify the problem)
I want to read and print it in the order of positions: "position 1", "position 2" ... "position n" in a text or csv file.
something like:
position name number
1 player1 524
2 player2 333
I tried:
data = [{'position': 1, 'name':'player1', 'number': 524}, {'position':2, 'name': 'player2:','number': 333}]
keys = data[0].keys()
with open("output.csv", 'r') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(data)
It seems like I should create the csv instead of opening it first. Also, are there any better ways? Thanks.
The easiest thing to do would probably be to read it into a pandas DataFrame and then write it to a CSV.
import pandas as pd
data = [
    {
        'position': 1,
        'name': 'player1',
        'number': 524
    }, {
        'position': 2,
        'name': 'player2',
        'number': 333
    }
]
df = pd.DataFrame.from_records(data, columns=['position', 'name', 'number'])
df = df.sort_values('position')
df.to_csv('data.csv')
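If you would rather stay with the standard-library csv module, the original attempt works once the file is opened for writing instead of reading; a sketch:
import csv

data = [{'position': 1, 'name': 'player1', 'number': 524},
        {'position': 2, 'name': 'player2', 'number': 333}]
keys = data[0].keys()
with open('output.csv', 'w', newline='') as output_file:  # 'w' creates the file if it does not exist
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(sorted(data, key=lambda d: d['position']))  # keep position order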
Use pandas:
import pandas as pd
data = [
    {
        'position': 1,
        'name': 'player1:',
        'number': 524
    }, {
        'position': 2,
        'name': 'player2:',
        'number': 333
    }
]
df = pd.DataFrame.from_records(data, columns=['position', 'name', 'number'])
df = df.sort_values('position')
df.head()
