Replace list of items in Pandas dataframe by tree leaves - python

I have a tree Locations which has a Continent -> Country -> Location hierarchy. Now I have a dataframe where each row has a list of entries from this tree.
How can I replace each entry of the list with the leaves below it in the tree?
My creativity with apply or map and possibly a lambda function is lacking.
Minimal example:
import pandas as pd

Locations = {
    'Europe': {
        'Germany': ['Berlin'],
        'France': ['Paris', 'Bordeaux']},
    'Asia': {
        'China': ['Hong Kong'],
        'Indonesia': ['Jakarta']},
    'North America': {
        'United States': ['New York', 'Washington']}}

df = pd.DataFrame({'Persons': ['A', 'B'], 'Locations': [
    ['North America', 'United States', 'Asia', 'France'],
    ['North America', 'Asia', 'Europe', 'Germany']]})

df = df.apply(...)?
df = df.map(...)?

# How to end up with:
pd.DataFrame({'Persons': ['A', 'B'], 'Locations': [
    ['New York', 'Washington', 'Hong Kong', 'Jakarta', 'Paris', 'Bordeaux'],
    ['New York', 'Washington', 'Hong Kong', 'Jakarta', 'Paris', 'Bordeaux', 'Berlin']]})
# Note: the order of the locations doesn't matter, so this is also OK:
pd.DataFrame({'Persons': ['A', 'B'], 'Locations': [
    ['Jakarta', 'Washington', 'Hong Kong', 'Paris', 'New York', 'Bordeaux'],
    ['Jakarta', 'Berlin', 'Washington', 'Hong Kong', 'Paris', 'New York', 'Bordeaux']]})

You do not really need the apply method. You can start by changing the structure of your Locations dictionary so that it maps every tree node (continent or country) to its leaf cities. Then, combine several explode, drop_duplicates and groupby statements with different aggregation logics to produce your desired result.
Code:
import pandas as pd
from collections import defaultdict
from itertools import chain

Locations = {
    'Europe': {'Germany': ['Berlin'], 'France': ['Paris', 'Bordeaux']},
    'Asia': {'China': ['Hong Kong'], 'Indonesia': ['Jakarta']},
    'North America': {'United States': ['New York', 'Washington']}
}
df = pd.DataFrame({'Persons': ['A', 'B'], 'Locations': [
    ['North America', 'United States', 'Asia', 'France'],
    ['North America', 'Asia', 'Europe']]})

# map every tree node (continent or country) to its leaf cities
mapping_locs = defaultdict(list)
for key, val in Locations.items():
    mapping_locs[key] = list(chain.from_iterable(val.values()))
    for lkey, lval in val.items():
        mapping_locs[lkey] = lval

(
    df.assign(
        mapped_locations=(
            df.explode("Locations")["Locations"].map(mapping_locs).reset_index()
            .explode("Locations").drop_duplicates(subset=["index", "Locations"])
            .groupby(level=0).agg({"index": "first", "Locations": list})
            .groupby("index").apply(lambda x: list(chain.from_iterable(x["Locations"])))
        )
    )
)
Output:
Persons Locations mapped_locations
0 A [North America, United States, Asia, France] [New York, Washington, Hong Kong, Jakarta, Par...
1 B [North America, Asia, Europe] [New York, Washington, Hong Kong, Jakarta, Ber...
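For what it's worth, once mapping_locs maps every tree node to its leaf cities, a plain apply is also enough; a minimal sketch, assuming the mapping_locs built above:
def to_leaves(nodes):
    # collect the leaf cities of every node, keeping each city once per row
    leaves = []
    for node in nodes:
        for city in mapping_locs.get(node, []):
            if city not in leaves:
                leaves.append(city)
    return leaves

df['Locations'] = df['Locations'].apply(to_leaves)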

Related

Using regex in python for a dynamic string

I have a pandas column with strings which don't all follow the same pattern, something like this:
{'iso_2': 'FR', 'iso_3': 'FRA', 'name': 'France'}
{'iso': 'FR', 'iso_2': 'USA', 'name': 'United States of America'}
{'iso_3': 'FR', 'iso_4': 'FRA', 'name': 'France'}
How do I only keep the name of the country for every row? I would only like to keep "France", "United States of America", "France".
I tried building a regex pattern, something like this:
r"^\W+[a-z]+_[0-9]\W+"
But this turns out to be very specific, and if there is a slight change in the string the pattern won't work. How do we resolve this?
As you have dictionaries in the column, you can get the values of the name keys:
import pandas as pd
df = pd.DataFrame({'col': [{'iso_2': 'FR', 'iso_3': 'FRA', 'name': 'France'},
                           {'iso': 'FR', 'iso_2': 'USA', 'name': 'United States of America'},
                           {'iso_3': 'FR', 'iso_4': 'FRA', 'name': 'France'}]})
df['col'] = df['col'].apply(lambda x: x["name"])
Output of df['col']:
0 France
1 United States of America
2 France
Name: col, dtype: object
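As a side note, pandas' Series.str.get also works element-wise on dictionaries, so this minimal alternative to the apply should give the same result:
df['col'] = df['col'].str.get('name')  # element-wise lookup of the 'name' key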
If the column contains stringified dictionaries, you can use ast.literal_eval before accessing the name key value:
import pandas as pd
import ast
df = pd.DataFrame({'col': ["{'iso_2': 'FR', 'iso_3': 'FRA', 'name': 'France'}",
                           "{'iso': 'FR', 'iso_2': 'USA', 'name': 'United States of America'}",
                           "{'iso_3': 'FR', 'iso_4': 'FRA', 'name': 'France'}"]})
df['col'] = df['col'].apply(lambda x: ast.literal_eval(x)["name"])
And in case your column is totally messed up, yes, you can resort to regex:
df['col'] = df['col'].str.extract(r"""['"]name['"]\s*:\s*['"]([^"']+)""")
# or, to support escaped " and ':
df['col'] = df['col'].str.extract(r"""['"]name['"]\s*:\s*['"]([^"'\\]+(?:\\.[^'"\\]*)*)""")
>>> df['col']
0                      France
1    United States of America
2                      France
Name: col, dtype: object
See the regex demo.

Matching part of a string with a value in two pandas dataframes

Given the following df with street names:
df = pd.DataFrame({'street1': ['36 Angeles', 'New York', 'Rice Street', 'Levitown']})
And df2, which contains the streets to match and their corresponding county:
df2 = pd.DataFrame({'street2': ['Angeles', 'Caguana', 'Levitown'], 'county': ["Utuado", "Utuado", "Bayamon"]})
How can I create a column that tells me the state where each street of df is, through a pairing of df (street1) with df2 (street2)? The matching does not have to be perfect; it must match at least one word.
The following dataframe is an example of what I want to obtain:
desiredoutput = pd.DataFrame({'street1': ['36 Angeles', 'New York', 'Rice Street', 'Levitown'], 'state': ["Utuado", "NA", "NA", "Bayamon"]})
Maybe a naive approach, but it works well.
df = pd.DataFrame({'street1': ['36 Angeles', 'New York', 'Rice Street', 'Levitown']})
df2 = pd.DataFrame({'street2': ['Angeles', 'Caguana', 'Levitown'], 'county': ["Utuado", "Utuado", "Bayamon"]})

output = {'street1': [], 'county': []}
streets1 = df['street1']
streets2 = df2['street2']
county = df2['county']
for street in streets1:
    count = 0  # tracks whether this street matched anything
    for index, street2 in enumerate(streets2):
        if street2 in street:
            output['street1'].append(street)
            output['county'].append(county[index])
            count = 1
    if count == 0:  # no match found, fill with 'NA'
        output['street1'].append(street)
        output['county'].append('NA')
print(output)
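A more pandas-idiomatic sketch, assuming each address contains at most one of the streets in df2: build an alternation pattern from the known streets, extract it, and map it to the county.
# a sketch, assuming at most one df2 street occurs in each address
pattern = '(' + '|'.join(df2['street2']) + ')'  # '(Angeles|Caguana|Levitown)'
df['county'] = (df['street1']
                .str.extract(pattern, expand=False)        # matched street or NaN
                .map(df2.set_index('street2')['county']))  # street -> county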

Select Rows where one cell contains 'some' word and store in a variable

I'm trying to analyze StackOverflow Survey Data which can be found here.
I want to get all rows which contain "United States" in the "Country" column and then store the values in a variable.
For example:
my_data = {'Age1stCode': [12, 13, 15, 16, 18],
           'Country': ['India', 'China', 'United States', 'England', 'United States'],
           'x': ['a', 'b', 'c', 'd', 'e']}
What I want:
my_result_data = {'Age1stCode': [15, 18],
                  'Country': ['United States', 'United States'],
                  'x': ['c', 'e']}
If you need to filter or search for a specific string in a specific column of a dataframe, you just need:
df_query = df.loc[df['Country'].str.contains('United States')]
You can set case=False to make the search case-insensitive.
Check the docs for more info: pandas doc
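Applied to the example data above, a minimal sketch:
import pandas as pd

df = pd.DataFrame({'Age1stCode': [12, 13, 15, 16, 18],
                   'Country': ['India', 'China', 'United States', 'England', 'United States'],
                   'x': ['a', 'b', 'c', 'd', 'e']})
my_result_data = df.loc[df['Country'].str.contains('United States')]
print(my_result_data)
#    Age1stCode        Country  x
# 2          15  United States  c
# 4          18  United States  e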

How to convert dataframe to nested dictionary?

I am running Python 3.7 and Pandas 1.1.3 and have a DataFrame which looks like this:
location = {'city_id': [22000, 25000, 27000, 35000],
            'population': [3971883, 2720546, 667137, 1323],
            'region_name': ['California', 'Illinois', 'Massachusetts', 'Georgia'],
            'city_name': ['Los Angeles', 'Chicago', 'Boston', 'Boston'],
            }
df = pd.DataFrame(location, columns=['city_id', 'population', 'region_name', 'city_name'])
I want to transform this dataframe into a dict that looks like:
{
    'Boston': {'Massachusetts': 27000, 'Georgia': 35000},
    'Chicago': {'Illinois': 25000},
    'Los Angeles': {'California': 22000}
}
And if the same city appears in different regions, the nested dict should be sorted by population (for example, Boston is in both Massachusetts and Georgia; the Boston in Massachusetts is bigger, so we output it first).
My code is:
result = df.groupby(['city_name'])[['region_name', 'city_id']].apply(lambda x: x.set_index('region_name').to_dict()).to_dict()
Output:
{'Boston': {'city_id': {'Massachusetts': 27000, 'Georgia': 35000}},
'Chicago': {'city_id': {'Illinois': 25000}},
'Los Angeles': {'city_id': {'California': 22000}}}
As you can see, the dictionary contains an extra 'city_id' key.
Please tell me how I should change my code to get the expected result.
Just chain another apply() onto your current solution:
result = (df.groupby(['city_name'])[['region_name', 'city_id']]
            .apply(lambda x: x.set_index('region_name').to_dict())
            .apply(lambda x: list(x.values())[0])
            .to_dict())
Now if you print result you will get your expected output:
{'Boston': {'Massachusetts': 27000, 'Georgia': 35000},
'Chicago': {'Illinois': 25000},
'Los Angeles': {'California': 22000}}
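To also honor the population ordering the question asks for (dicts preserve insertion order in Python 3.7+), a hedged sketch is to sort before grouping:
# a sketch: sort descending by population so bigger cities are inserted first
result = (df.sort_values('population', ascending=False)
            .groupby(['city_name'])[['region_name', 'city_id']]
            .apply(lambda x: x.set_index('region_name').to_dict())
            .apply(lambda x: list(x.values())[0])
            .to_dict())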

How to turn a list of a list of dictionaries into a dataframe via loop

I have a list of lists of dictionaries. I managed to access each list element within the outer list and convert its dictionary via pandas into a DataFrame. I then save each DF and later concat them. That's a perfect result. But I need a loop to do that for big data.
Here is my MWE which works fine in principle.
import pandas as pd
mwe = [
    [{"name": "Norway", "population": 5223256, "area": 323802.0, "gini": 25.8}],
    [{"name": "Switzerland", "population": 8341600, "area": 41284.0, "gini": 33.7}],
    [{"name": "Australia", "population": 24117360, "area": 7692024.0, "gini": 30.5}],
]
df0 = pd.DataFrame.from_dict(mwe[0])
df1 = pd.DataFrame.from_dict(mwe[1])
df2 = pd.DataFrame.from_dict(mwe[2])
frames = [df0, df1, df2]
result = pd.concat(frames)
It creates a nice table.
Here is what I tried to create a list of data frames:
for i in range(len(mwe)):
    frame = pd.DataFrame()
    frame = pd.DataFrame.from_dict(mwe[i])
    frames = []
    frames.append(frame)
Addendum: Thanks for all the answers. They work on my MWE, which made me notice that there are some strange entries in my dataset. No solution works for my dataset, since I have an inner-list element which contains two dictionaries (due to non-unique data retrieval):
....
[{'name': 'United States Minor Outlying Islands', 'population': 300},
{'name': 'United States of America',
'population': 323947000,
'area': 9629091.0,
'gini': 48.0}],
...
How can I drop the entry for "United States Minor Outlying Islands"?
You could get each dict out of the containing list and just have a list of dicts:
import pandas as pd

mwe = [[{'name': 'Norway', 'population': 5223256, 'area': 323802.0, 'gini': 25.8}],
       [{'name': 'Switzerland', 'population': 8341600, 'area': 41284.0, 'gini': 33.7}],
       [{'name': 'Australia', 'population': 24117360, 'area': 7692024.0, 'gini': 30.5}]]
# use x.pop() so that you aren't carrying around copies of the data
# for a "big data" application
df = pd.DataFrame([x.pop() for x in mwe])
df.head()
area gini name population
0 323802.0 25.8 Norway 5223256
1 41284.0 33.7 Switzerland 8341600
2 7692024.0 30.5 Australia 24117360
By bringing the list comprehension into the dataframe declaration, that list is temporary, and you don't have to worry about cleanup. pop also consumes the dictionaries out of mwe, minimizing the number of copies you carry around in memory.
As a note, when doing this, mwe will then look like:
mwe
[[], [], []]
because the contents of the sub-lists have been popped out.
EDIT: New Question Content
If your data contains duplicates, or at least entries you don't want, and the undesired entries don't have matching columns to the rest of the dataset (which appears to be the case), it becomes a bit trickier to avoid copying data as above:
mwe.append([{'name': 'United States Minor Outlying Islands', 'population': 300},
            {'name': 'United States of America', 'population': 323947000, 'area': 9629091.0, 'gini': 48.0}])

key_check = {}.fromkeys(["name", "population", "area", "gini"])

# the easy way, but it copies data
df = pd.DataFrame([item for data in mwe
                   for item in data
                   if item.keys() == key_check.keys()])
Since you'll still have the data hanging around in mwe, it might be better to use a generator:
def get_filtered_data(mwe):
    for data in mwe:
        while data:  # when data is empty, the while loop will end
            item = data.pop()  # still consumes data out of mwe
            if item.keys() == key_check.keys():
                yield item  # lazy evaluation minimizes data copying

df = pd.DataFrame([x for x in get_filtered_data(mwe)])
area gini name population
0 323802.0 25.8 Norway 5223256
1 41284.0 33.7 Switzerland 8341600
2 7692024.0 30.5 Australia 24117360
3 9629091.0 48.0 United States of America 323947000
Again, this is under the assumption that the non-desired entries have mismatched columns, which appears to be the case here. Otherwise, this will at least flatten out the data structure so you can filter it with pandas later.
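A hedged pandas-side alternative, assuming mwe still holds its data (i.e. nothing has been popped out yet): flatten everything, then drop the incomplete rows, since the unwanted entries are exactly the ones missing columns.
from itertools import chain

# a sketch: flatten the nested lists, then drop rows with missing columns
df = pd.DataFrame(list(chain.from_iterable(mwe))).dropna()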
Create an empty DataFrame and loop over the list, using df.append on each iteration:
>>> import pandas as pd
mwe = [[{'name': 'Norway', 'population': 5223256, 'area': 323802.0, 'gini': 25.8}],
       [{'name': 'Switzerland', 'population': 8341600, 'area': 41284.0, 'gini': 33.7}],
       [{'name': 'Australia', 'population': 24117360, 'area': 7692024.0, 'gini': 30.5}]]
>>> df = pd.DataFrame()
>>> for country in mwe:
... df = df.append(country)
...
>>> df
area gini name population
0 323802.0 25.8 Norway 5223256
0 41284.0 33.7 Switzerland 8341600
0 7692024.0 30.5 Australia 24117360
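Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on recent versions an equivalent sketch with pd.concat would be:
# pandas >= 2.0: DataFrame.append no longer exists, use concat instead
df = pd.concat([pd.DataFrame(country) for country in mwe], ignore_index=True)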
Try this:
df = pd.DataFrame(columns=['name', 'population', 'area', 'gini'])
for i in range(len(mwe)):
    df.loc[i] = list(mwe[i][0].values())
Output:
          name  population        area  gini
0       Norway     5223256    323802.0  25.8
1  Switzerland     8341600     41284.0  33.7
2    Australia    24117360   7692024.0  30.5
