I have the following DataFrame:
a = [{'order': '789', 'name': 'A', 'date': 20220501, 'sum': 15.1},
     {'order': '456', 'name': 'A', 'date': 20220501, 'sum': 19},
     {'order': '704', 'name': 'B', 'date': 20220502, 'sum': 14.1},
     {'order': '704', 'name': 'B', 'date': 20220502, 'sum': 22.9},
     {'order': '700', 'name': 'B', 'date': 20220502, 'sum': 30.1},
     {'order': '710', 'name': 'B', 'date': 20220502, 'sum': 10.5}]
df = pd.DataFrame(a)
print(df)
I need to count the distinct values in the column order and add them as a new column order_count, grouping by the columns name and date and summing the values in the column sum.
I need to get the following result:
  name      date   sum  order_count
0    A  20220501  34.1            2
1    B  20220502  77.6            3
In your case do
out = df.groupby(['name', 'date'], as_index=False).agg({'sum': 'sum', 'order': 'nunique'})
Out[652]:
  name      date   sum  order
0    A  20220501  34.1      2
1    B  20220502  77.6      3
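If you want the column named order_count as in the question, a small addition (not part of the original answer) renames it:
out = out.rename(columns={'order': 'order_count'})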
import pandas as pd
sums = df[['name', 'date', 'sum']].groupby(['name', 'date']).sum().reset_index()
counts = df.groupby(['name', 'date'])['order'].nunique().reset_index(drop=True).rename('order_count')
sums.join(counts)
I have a list of dictionaries like so:
[{'asset': 'Discovery Fund', 'amount': 100000, 'rating': 'High'},
{'asset': 'Ethical Fund', 'amount': 200000, 'rating': 'High'},
{'asset': 'Foreign Stocks', 'amount': 9350000, 'rating': 'Very High'},
{'asset': 'Local Stocks', 'amount': 550000, 'rating': 'Very High'}]
I'm trying to merge any duplicate values for the rating key and sum up the amounts.
For example, the list above has the rating values "High", "High", "Very High", "Very High".
Expected result:
[
{'amount': 300000, 'rating': 'High'},
{'amount': 9900000, 'rating': 'Very High'}
]
How can I go about this? Any help would be appreciated. Thanks.
You can achieve this quite easily using pandas.
import pandas as pd
data = [{'asset': 'Discovery Fund', 'amount': 100000, 'rating': 'High'},
{'asset': 'Ethical Fund', 'amount': 200000, 'rating': 'High'},
{'asset': 'Foreign Stocks', 'amount': 9350000, 'rating': 'Very High'},
{'asset': 'Local Stocks', 'amount': 550000, 'rating': 'Very High'}]
df = pd.DataFrame(data)
# select 'amount' so the string 'asset' column is not summed
output = df.groupby('rating', as_index=False)['amount'].sum().to_dict('records')
print(output)
#[{'rating': 'High', 'amount': 300000}, {'rating': 'Very High', 'amount': 9900000}]
To do this without pandas, we can loop over the list and use dict.get() to grab the running total for each rating, defaulting to 0.
output = {}
for _dict in data:
    output[_dict['rating']] = output.get(_dict['rating'], 0) + _dict['amount']
output
#{'High': 300000, 'Very High': 9900000}
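For the exact list-of-dicts shape in the expected result, an equivalent sketch using collections.Counter (a variant, not from the original answer):
from collections import Counter

totals = Counter()
for item in data:
    # accumulate the amount per rating; missing keys default to 0
    totals[item['rating']] += item['amount']
result = [{'amount': amount, 'rating': rating} for rating, amount in totals.items()]
print(result)
#[{'amount': 300000, 'rating': 'High'}, {'amount': 9900000, 'rating': 'Very High'}]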
I have a DataFrame with many rows.
How can I turn the source DataFrame below into the target DataFrame, which has a single row?
import pandas as pd
# source dataframe
df_source = pd.DataFrame({
    'ID': ['A01', 'A01'],
    'Code': ['101', '102'],
    'amount for code': [10000, 20000],
    'count for code': [4, 3]
})
# target dataframe
df_target = pd.DataFrame({
    'ID': ['A01'],
    'Code101': [1],
    'Code102': [1],
    'Code103': [0],
    'amount for code101': [10000],
    'count for code101': [4],
    'amount for code102': [20000],
    'count for code102': [3],
    'amount for code103': [None],
    'count for code103': [None],
    'count for code': [None],
    'sum of amount': [30000],
    'sum of count': [7]
})
I tried to use the get_dummies method, but it can only indicate whether a given code was present or not.
How can I reshape the DataFrame to build my dataset?
You can iterate through the rows of your existing dataframe and populate (using .at or .loc) your new dataframe (df2). df2 will have the index ID, which is now unique.
import pandas as pd
df = pd.DataFrame({
    'ID': ['A01', 'A01'],
    'Code': ['101', '102'],
    'amount for code': [10000, 20000],
    'count for code': [4, 3]
})
df2 = pd.DataFrame()
for idx, row in df.iterrows():
    for col in df.columns:
        if col != 'ID' and col != 'Code':
            # e.g. 'amount for code' + '101' -> 'amount for code101'
            df2.at[row['ID'], col + row['Code']] = row[col]
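To also fill in the indicator and total columns from the target layout, a hedged sketch (the column names follow the df_target above):
# counts of each code per ID (1/0 here), prefixed to match 'Code101', 'Code102'
df2 = df2.join(pd.crosstab(df['ID'], df['Code']).add_prefix('Code'))
# aggregate totals over the source rows
df2['sum of amount'] = df.groupby('ID')['amount for code'].sum()
df2['sum of count'] = df.groupby('ID')['count for code'].sum()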
You can use pivot_table:
df_result = df.pivot_table(index='ID', columns='Code', values=['amount for code', 'count for code'])
This will return a data frame with a multi-level column index, for example ('amount for code', '101').
Then you can add other calculated columns like sum of amount and so on.
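For example, a minimal sketch (assuming the df and df_result defined above) that flattens the column index and appends the totals from the target frame:
# flatten ('amount for code', '101') -> 'amount for code101'
df_result.columns = [value + code for value, code in df_result.columns]
df_result['sum of amount'] = df.groupby('ID')['amount for code'].sum()
df_result['sum of count'] = df.groupby('ID')['count for code'].sum()
df_result = df_result.reset_index()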
I have a pandas DataFrame where the column 'items' holds a dictionary per transaction showing which products have been bought:
data = {'price': [40, 15, 10, 2],
        'items': ["{'product': 'Product1', 'quantity': 4, 'product': 'Product2', 'quantity': 1}",
                  "{'product': 'Product2', 'quantity': 1, 'product': 'Product3', 'quantity': 1, 'product': 'Product1', 'quantity': 1}",
                  "{'product': 'Product1', 'quantity': 4}",
                  "{'product': 'Product3', 'quantity': 1, 'product': 'Product1', 'quantity': 1}"]
        }
df = pd.DataFrame(data, columns=['price', 'items'])
I want to find out which products have been bought most. In this case the result should look like:
Product1: 4
Product2: 2
How can I count the most frequent values of the key 'product' within the column 'items'?
Perhaps you could use a namedtuple (from the built-in collections package).
First, define a named tuple called Record and create a list of these:
from collections import namedtuple
import pandas as pd
Record = namedtuple('Record', 'price product quantity')
records = [
Record(40, 'Product1', 4), Record(40, 'Product2', 1),
Record(15, 'Product2', 1), Record(15, 'Product3', 1), Record(15, 'Product1', 1),
Record(10, 'Product1', 4),
Record( 2, 'Product3', 1), Record(2, 'Product1', 1),]
Second, create the data frame, and use groupby to compute the total quantity of each product:
# create data frame
df = pd.DataFrame(records)
# compute summary statistic
df = df.groupby('product')['quantity'].sum()
print(df)
product
Product1 10
Product2 2
Product3 2
Name: quantity, dtype: int64
I did not match your expected results. Sorry if I misunderstood your data and/or question.
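If the original strings have to be parsed as-is (they contain repeated 'product' keys, so they cannot be loaded as real dicts), a hedged sketch that extracts the products with a regular expression and counts how often each one occurs, using the df built from the question's data:
import re
from collections import Counter

counts = Counter()
for s in df['items']:
    # pull every product name out of the raw string
    counts.update(re.findall(r"'product': '([^']+)'", s))
print(counts.most_common())
#[('Product1', 4), ('Product2', 2), ('Product3', 2)]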
Given these data frames:
IncomingCount
-------------------------
Venue|Date | 08 | 10 |
-------------------------
Hotel|20190101| 15 | 03 |
Beach|20190101| 93 | 45 |
OutgoingCount
-------------------------
Venue|Date | 07 | 10 |
-------------------------
Beach|20190101| 30 | 5 |
Hotel|20190103| 05 | 15 |
How can I merge (full join) the two tables, resulting in something like the following, without having to manually loop through each row of both tables?
Dictionary:
[
{"Venue":"Hotel", "Date":"20190101", "08":{ "IncomingCount":15 }, "10":{ "IncomingCount":03 } },
{"Venue":"Beach", "Date":"20190101", "07":{ "OutgoingCount":30 }, "08":{ "IncomingCount":93 }, "10":{ "IncomingCount":45, "OutgoingCount":15 } },
{"Venue":"Hotel", "Date":"20190103", "07":{ "OutgoingCount":05 }, "10":{ "OutgoingCount":15 } }
]
The conditions are:
Venue and Date columns act like join conditions.
The other columns, represented as numbers, are created dynamically.
If a dynamic column does not exist, it is excluded (or included with None as the value).
It's pretty fiddly, but it can be done by making use of the create_map function from Spark.
Basically, divide the columns into four groups: keys (Venue, Date), common (10), only incoming (08), and only outgoing (07).
Then create a mapper per group (except keys), mapping only what's available in that group. Apply the mapping, drop the old column, and rename the mapped column to the old name.
Lastly, convert all rows to dicts (from the DataFrame's rdd) and collect.
from pyspark.sql import SparkSession
from pyspark.sql.functions import create_map, col, lit
spark = SparkSession.builder.appName('hotels_and_beaches').getOrCreate()
incoming_counts = spark.createDataFrame([('Hotel', 20190101, 15, 3), ('Beach', 20190101, 93, 45)], ['Venue', 'Date', '08', '10']).alias('inc')
outgoing_counts = spark.createDataFrame([('Beach', 20190101, 30, 5), ('Hotel', 20190103, 5, 15)], ['Venue', 'Date', '07', '10']).alias('out')
df = incoming_counts.join(outgoing_counts, on=['Venue', 'Date'], how='full')
outgoing_cols = {c for c in outgoing_counts.columns if c not in {'Venue', 'Date'}}
incoming_cols = {c for c in incoming_counts.columns if c not in {'Venue', 'Date'}}
common_cols = outgoing_cols.intersection(incoming_cols)
outgoing_cols = outgoing_cols.difference(common_cols)
incoming_cols = incoming_cols.difference(common_cols)
for c in common_cols:
    df = df.withColumn(
        c + '_new', create_map(
            lit('IncomingCount'), col('inc.{}'.format(c)),
            lit('OutgoingCount'), col('out.{}'.format(c)),
        )
    ).drop(c).withColumnRenamed(c + '_new', c)
for c in incoming_cols:
    df = df.withColumn(
        c + '_new', create_map(
            lit('IncomingCount'), col('inc.{}'.format(c)),
        )
    ).drop(c).withColumnRenamed(c + '_new', c)
for c in outgoing_cols:
    df = df.withColumn(
        c + '_new', create_map(
            lit('OutgoingCount'), col('out.{}'.format(c)),
        )
    ).drop(c).withColumnRenamed(c + '_new', c)
result = df.coalesce(1).rdd.map(lambda r: r.asDict()).collect()
print(result)
result:
[{'Venue': 'Hotel', 'Date': 20190101, '10': {'OutgoingCount': None, 'IncomingCount': 3}, '08': {'IncomingCount': 15}, '07': {'OutgoingCount': None}},
 {'Venue': 'Hotel', 'Date': 20190103, '10': {'OutgoingCount': 15, 'IncomingCount': None}, '08': {'IncomingCount': None}, '07': {'OutgoingCount': 5}},
 {'Venue': 'Beach', 'Date': 20190101, '10': {'OutgoingCount': 5, 'IncomingCount': 45}, '08': {'IncomingCount': 93}, '07': {'OutgoingCount': 30}}]
I can get this so far:
import pandas as pd
import numpy as np
dd1 = {'venue': ['hotel', 'beach'], 'date':['20190101', '20190101'], '08': [15, 93], '10':[3, 45]}
dd2 = {'venue': ['beach', 'hotel'], 'date':['20190101', '20190103'], '07': [30, 5], '10':[5, 15]}
df1 = pd.DataFrame(data=dd1)
df2 = pd.DataFrame(data=dd2)
df1.columns = [f"IncomingCount:{x}" if x not in ['venue', 'date'] else x for x in df1.columns]
df2.columns = [f"OutgoingCount:{x}" if x not in ['venue', 'date'] else x for x in df2.columns ]
ll_dd = pd.merge(df1, df2, on=['venue', 'date'], how='outer').to_dict('records')
ll_dd = [{k:v for k,v in dd.items() if not pd.isnull(v)} for dd in ll_dd]
OUTPUT:
[{'venue': 'hotel',
'date': '20190101',
'IncomingCount:08': 15.0,
'IncomingCount:10': 3.0},
{'venue': 'beach',
'date': '20190101',
'IncomingCount:08': 93.0,
'IncomingCount:10': 45.0,
'OutgoingCount:07': 30.0,
'OutgoingCount:10': 5.0},
{'venue': 'hotel',
'date': '20190103',
'OutgoingCount:07': 5.0,
'OutgoingCount:10': 15.0}]
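To get closer to the nested layout the OP asked for, a hedged sketch that post-processes each record in ll_dd, splitting keys like 'IncomingCount:08' into nested dictionaries:
nested = []
for dd in ll_dd:
    rec = {}
    for k, v in dd.items():
        if ':' in k:
            # 'IncomingCount:08' -> {'08': {'IncomingCount': 15}}
            direction, number = k.split(':')
            rec.setdefault(number, {})[direction] = int(v)
        else:
            rec[k] = v
    nested.append(rec)
print(nested)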
The final result as desired by the OP is a list of dictionaries, where all rows from the DataFrame that have the same Venue and Date have been clubbed together.
# Creating the DataFrames
df_Incoming = sqlContext.createDataFrame([('Hotel','20190101',15,3),('Beach','20190101',93,45)],('Venue','Date','08','10'))
df_Incoming.show()
+-----+--------+---+---+
|Venue| Date| 08| 10|
+-----+--------+---+---+
|Hotel|20190101| 15| 3|
|Beach|20190101| 93| 45|
+-----+--------+---+---+
df_Outgoing = sqlContext.createDataFrame([('Beach','20190101',30,5),('Hotel','20190103',5,15)],('Venue','Date','07','10'))
df_Outgoing.show()
+-----+--------+---+---+
|Venue| Date| 07| 10|
+-----+--------+---+---+
|Beach|20190101| 30| 5|
|Hotel|20190103| 5| 15|
+-----+--------+---+---+
The idea is to create a dictionary from each row and store all the rows of the DataFrame as dictionaries in one big list. As a final step, we club together the dictionaries that have the same Venue and Date.
Since all rows in the DataFrame are stored as Row() objects, we use the collect() function to return all records as a list of Row() objects. Just to illustrate the output:
print(df_Incoming.collect())
[Row(Venue='Hotel', Date='20190101', 08=15, 10=3), Row(Venue='Beach', Date='20190101', 08=93, 10=45)]
But since we want a list of dictionaries, we can use a list comprehension to convert them to one:
list_Incoming = [row.asDict() for row in df_Incoming.collect()]
print(list_Incoming)
[{'10': 3, 'Date': '20190101', 'Venue': 'Hotel', '08': 15}, {'10': 45, 'Date': '20190101', 'Venue': 'Beach', '08': 93}]
But the numeric columns should be in the form "08":{ "IncomingCount":15 } instead of "08":15, so we employ a dictionary comprehension to convert them into this form:
list_Incoming = [ {k:v if k in ['Venue','Date'] else {'IncomingCount':v} for k,v in dict_element.items()} for dict_element in list_Incoming]
print(list_Incoming)
[{'10': {'IncomingCount': 3}, 'Date': '20190101', 'Venue': 'Hotel', '08': {'IncomingCount': 15}}, {'10': {'IncomingCount': 45}, 'Date': '20190101', 'Venue': 'Beach', '08': {'IncomingCount': 93}}]
Similarly, we do this for OutgoingCount:
list_Outgoing = [row.asDict() for row in df_Outgoing.collect()]
list_Outgoing = [ {k:v if k in ['Venue','Date'] else {'OutgoingCount':v} for k,v in dict_element.items()} for dict_element in list_Outgoing]
print(list_Outgoing)
[{'10': {'OutgoingCount': 5}, 'Date': '20190101', 'Venue': 'Beach', '07': {'OutgoingCount': 30}}, {'10': {'OutgoingCount': 15}, 'Date': '20190103', 'Venue': 'Hotel', '07': {'OutgoingCount': 5}}]
Final step: now that we have created the requisite lists of dictionaries, we need to club them together on the basis of Venue and Date.
from copy import deepcopy

def merge_lists(list_Incoming, list_Outgoing):
    # create dictionary from list_Incoming:
    dict1 = {(record['Venue'], record['Date']): record for record in list_Incoming}
    # compare elements in list_Outgoing to those in list_Incoming:
    result = {}
    for record in list_Outgoing:
        ckey = record['Venue'], record['Date']
        new_record = deepcopy(record)
        if ckey in dict1:
            for key, value in dict1[ckey].items():
                if key in ('Venue', 'Date'):
                    # do not merge these keys
                    continue
                # dict's "setdefault" finds a key/value, and if it is missing
                # creates a new one with the second parameter as value
                new_record.setdefault(key, {}).update(value)
        result[ckey] = new_record
    # add values from list_Incoming that were not matched in list_Outgoing:
    for key, value in dict1.items():
        if key not in result:
            result[key] = deepcopy(value)
    return list(result.values())
res = merge_lists(list_Incoming, list_Outgoing)
print(res)
[{'10': {'OutgoingCount': 5, 'IncomingCount': 45},
'Date': '20190101',
'Venue': 'Beach',
'08': {'IncomingCount': 93},
'07': {'OutgoingCount': 30}
},
{'10': {'OutgoingCount': 15},
'Date': '20190103',
'Venue': 'Hotel',
'07': {'OutgoingCount': 5}
},
{'10': {'IncomingCount': 3},
'Date': '20190101',
'Venue': 'Hotel',
'08': {'IncomingCount': 15}
}]