Count and remove duplicates in keys while preserving values

Count and remove duplicates in keys while preserving values - python

I have collated some data and made them into a dictionary as follows:
gen_dict = {
"item_C_v001" : "jack",
"item_C_v002" : "kris",
"item_A_v003" : "john",
"item_B_v006" : "peter",
"item_A_v005" : "john",
"item_A_v004" : "dave"
}
I am trying to print out the results in the following format:
Item Name | No. of Vers. | User
item_A | 3 | dave, john
item_B | 1 | peter
item_C | 2 | jack, kris
where it tabulates similar versions into one line, counting how many versions there are and, at the same time, listing the user names.
I am having trouble of integrating in the user names. I used the set() command, and that seems to apply for all my 3 rows of output.
Even so, while my 'Item Name' and 'no. of Vers.' column does seems correct, are there any ways in which I can check if the number of versions it found does adhere to the name? It is possible for me to count it manually if I have a small data but what if I got big data?
strip_ver_list = []
user_list = []
for item_name, user in gen_dict.iteritems():
# Strip out the version digits
strip_ver = item_name[:-3]
strip_ver_list.append(strip_ver)
user_list.append(user)
# This will count and remove the duplicates
versions_num = dict((duplicate, strip_ver_list.count(duplicate)) for duplicate in strip_ver_list)
for name, num in sorted(versions_num.iteritems()):
print "Version Name : {0}\nNo. of Versions : {1}\nUsers : {2}".format(name, num, set(user_list))
This is the output I have gotten:
Item Name | No. of Vers. | User
item_A | 3 | set(['dave', 'john', 'jack', 'kris', 'peter'])
item_B | 1 | set(['dave', 'john', 'jack', 'kris', 'peter'])
item_C | 2 | set(['dave', 'john', 'jack', 'kris', 'peter'])
This is the only method I can think up of.. But if there are any other viable methods to get around this, please do share with me

I would use a defaultdict to aggregate the data. Roughly:
>>> from collections import defaultdict
>>> gen_dict = {
... "item_C_v001" : "jack",
... "item_C_v002" : "kris",
... "item_A_v003" : "john",
... "item_B_v006" : "peter",
... "item_A_v005" : "john",
... "item_A_v004" : "dave"
... }
Now ...
>>> versions_num = defaultdict(lambda:dict(versions=set(), users = set()))
>>> for item_name, user in gen_dict.items():
... strip_ver = item_name[:-5]
... version_num = item_name[-3:]
... versions_num[strip_ver]['versions'].add(version_num)
... versions_num[strip_ver]['users'].add(user)
...
Finally,
>>> for item, data in versions_num.items():
... print("Item {} \tno. of Versions: {}\tUsers:{}".format(item, len(data['versions']), ",".join(data['users'])))
...
Item item_B no. of Versions: 1 Users:peter
Item item_A no. of Versions: 3 Users:john,dave
Item item_C no. of Versions: 2 Users:kris,jack
>>>
And if you want it sorted:
>>> for item, data in sorted(versions_num.items()):
... print("Item {} \tno. of Versions: {}\tUsers:{}".format(item, len(data['versions']), ",".join(data['users'])))
...
Item item_A no. of Versions: 3 Users:john,dave
Item item_B no. of Versions: 1 Users:peter
Item item_C no. of Versions: 2 Users:kris,jack

You need to group the lists by the item name and extract the users from each group, otherwise the user_list will always be a global list of users:
from itertools import groupby
# split the item_version
sorted_ver_num = sorted(k.rsplit("_", 1) + [v] for k, v in gen_dict.items())
# group the results by the item name
for k, g in groupby(sorted_ver_num, key = lambda x: x[0]):
# extract the user list within each group
# user_list = [user for *_, user in g]
user_list = [user for _, _, user in g]
print("Version Name : {0}\nNo. of Versions : {1}\nUsers : {2}".format(k, len(user_list), set(user_list)))
Version Name : item_A
No. of Versions : 3
Users : {'dave', 'john'}
Version Name : item_B
No. of Versions : 1
Users : {'peter'}
Version Name : item_C
No. of Versions : 2
Users : {'kris', 'jack'}

I would use a defaultdict to keep track of the users, and an ordinary dict to keep track of the count. The dict.get() method allows you to return a default value if the key is not found, in this case 0, and you just add 1 to it each time the key is found.
from collections import defaultdict

gen_dict = {
    "item_C_v001" : "jack",
    "item_C_v002" : "kris",
    "item_A_v003" : "john",
    "item_B_v006" : "peter",
    "item_A_v005" : "john",
    "item_A_v004" : "dave"
}

# user_dict maps an item name to the set of users who own a version of it;
# count_dict maps the same item name to how many versioned keys were seen.
user_dict = defaultdict(set)
count_dict = {}
for item_name, user in gen_dict.items():
    # Drop the trailing "_vNNN" suffix by splitting on the last underscore, so
    # "item_A_v003" becomes "item_A" regardless of how many digits follow.
    base_name = item_name.rsplit("_", 1)[0]
    user_dict[base_name].add(user)
    # dict.get() supplies 0 the first time a name is seen.
    count_dict[base_name] = count_dict.get(base_name, 0) + 1

for name, num in sorted(count_dict.items()):
    # Sets are unordered; sort the users so the report is reproducible.
    print("Version Name : {0}\nNo. of Versions : {1}\nUsers : {2}".format(
        name, num, ', '.join(sorted(user_dict[name]))))

Example in IPython:
In [1]: gen_dict = {
...: "item_C_v001" : "jack",
...: "item_C_v002" : "kris",
...: "item_A_v003" : "john",
...: "item_B_v006" : "peter",
...: "item_A_v005" : "john",
...: "item_A_v004" : "dave"
...: }
Get the keys, we'll be needing them more than once.
In [2]: keys = tuple(gen_dict.keys())
Find the set of items.
In [3]: items = set(j[:-5] for j in keys)
Table header and template.
In [4]: header = 'Item Name | No. of Vers. | User'
In [5]: template = '{:14}|{:<15}|{}'
Print relevant information for all items.
In [6]: print(header)
Item Name | No. of Vers. | User
In [7]: for i in items:
...: relevant = tuple(j for j in keys if j.startswith(i))
...: users = set(gen_dict[x] for x in relevant)
...: print(template.format(i, len(relevant), ' '.join(users)))
...:
item_A |3 |john dave
item_B |1 |peter
item_C |2 |kris jack

Related

String content become random integer after using append()

I'm writing a function to filter tweet data that contains search word.
Here's my code:
def twitter_filter(df, search):
coun = 0
date_ls = []
id_ls = []
content_ls = []
lan_ls = []
name_ls = []
retweet_ls = []
cleaned_tweet_ls = []
for i, row in df.iterrows():
if search in row.cleaned_tweet:
date_ls.append(row.date)
id_ls.append(row.id)
content_ls.append(row.content)
lan_ls.append(row.language)
name_ls.append(row.name)
retweet_ls.append(row.retweet)
cleaned_tweet_ls.append(row.cleaned_tweet)
new_dict = {
"date": date_ls,
"id": id_ls,
"content": content_ls,
"lan" : lan_ls,
"name" : name_ls,
"retweet" : retweet_ls,
"cleaned_tweeet": cleaned_tweet_ls,
}
new_df = pd.DataFrame(new_dict)
return new_df
Before filter:
cleandf['name']
Out[6]:
0 PryZmRuleZZ
1 Arbitration111
2 4kjweed
3 THEREALCAMOJOE
5 DailyBSC_
130997 Rabbitdogebsc
130999 gmtowner
131000 topcryptostats
131001 vGhostvRiderv
131002 gmtowner
Name: name, Length: 98177, dtype: object
After filter, user's name becomes random integer:
cleanedogetweet['name']
Out[7]:
0 3
1 5
2 9
3 12
4 34
80779 130997
80780 130999
80781 131000
80782 131001
80783 131002
Name: name, Length: 80784, dtype: int64
This problem only happened in user's name columns, other columns that contains string are ok.
I expected to remain the original user name, how can i solve the problem ?

In pandas dataframes, each row has an attribute called name.
You can use the name attribute to get the name of the row. By default, the name of the row is the index of the row.
So it's better that your column name would not be name because it will conflict with the name attribute of the row.
You can use the rename method to rename the column name and use another name like username, or you can change your function to this:
def twitter_filter(df, search):
    """Return a new DataFrame holding the rows whose cleaned tweet contains *search*.

    Args:
        df: DataFrame with the columns ``date``, ``id``, ``content``,
            ``language``, ``name``, ``retweet`` and ``cleaned_tweet``.
        search: Substring looked for in each row's ``cleaned_tweet``.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``date``,
        ``id``, ``content``, ``lan``, ``user_name``, ``retweet`` and
        ``cleaned_tweeet``, in that order.
    """
    # Output column -> source column. Every value is read with row[...]:
    # attribute access would break for "name", because ``row.name`` is the
    # row's index label rather than the "name" column.
    column_map = {
        "date": "date",
        "id": "id",
        "content": "content",
        "lan": "language",
        "user_name": "name",
        "retweet": "retweet",
        "cleaned_tweeet": "cleaned_tweet",
    }
    collected = {out_col: [] for out_col in column_map}
    for _, row in df.iterrows():
        if search in row["cleaned_tweet"]:
            for out_col, src_col in column_map.items():
                collected[out_col].append(row[src_col])
    return pd.DataFrame(collected)

How can I print hello and all female members

test.json
{
"A Company":[{"female":["Jessica","Eve"]},{"male":["Mike","Peter"]}],
"B Company":[{"female":["Laura","Pamela"]},{"male":["Mark","Steve"]}]
}
test.py
import json
f = open('test.json',)
data = json.load(f)
for v in data.values():
for element in v:
print(element)
Output:
{'female': ['Jessica', 'Eve']}
{'male': ['Mike', 'Peter']}
{'female': ['Laura', 'Pamela']}
{'male': ['Mark', 'Steve']}
How can I print this: "Hello Jessica" "Hello Eve" "Hello Laura" "Hello Pamela"?

You can use an iterator to extract the names and a for-loop to print the greetings without building an intermediate list:
data = {
"A Company":[{"female":["Jessica","Eve"]},{"male":["Mike","Peter"]}],
"B Company":[{"female":["Laura","Pamela"]},{"male":["Mark","Steve"]}]
}
names = (name for groups in data.values()
for group in groups
for name in group.get("female",[]))
for name in names: print("Hello",name)
Hello Jessica
Hello Eve
Hello Laura
Hello Pamela

You missed the innermost loop, where you iterate the inner records and check if they are Males or Females.
Please see the example:
import json

json_file = """
{
"A Company":[{"female":["Jessica","Eve"]},{"male":["Mike","Peter"]}],
"B Company":[{"female":["Laura","Pamela"]},{"male":["Mark","Steve"]}]
}
"""

parsed = json.loads(json_file)
for val in parsed.values():
    for record in val:
        # This below is the innermost loop over each record's key and names
        for key, value in record.items():
            # Only the "female" lists are greeted. A plain for-loop is used
            # because print() is a side effect; a list comprehension would
            # only build a throwaway list of None values.
            if key == "female":
                for name in value:
                    print(f"Hello {name}")

compare two lists of dictionaries for specific fields

I've two lists containing dictionaries. I want to compare certain fields in each of these dictionaries.
current_list = [{"name": "Bill","address": "Home", "age": 23, "accesstime":11:14:01},
{"name": "Fred","address": "Home", "age": 26, "accesstime":11:57:43},
{"name": "Nora","address": "Home", "age": 33, "accesstime":11:24:14}]
backup_list = [{"name": "Bill","address": "Home", "age": 23, "accesstime":13:34:24},
{"name": "Fred","address": "Home", "age": 26, "accesstime":13:34:26},
{"name": "Nora","address": "Home", "age": 33, "accesstime":13:35:14}]
The list / dictionaries should be the same in order, and i just want to compare certain key, value pairs. Like name, address, age and ignore access time, but what i have so far compares each key / pair. So i just want to compare
current_list:dictionary[0][name] -> backup_list:dictionary[0][name] and then
current_list:dictionary[0][address] -> backup_list:dictionary[0][address]
and so on.
for x in current_list:
for y in backup_list:
for k, v in x.items():
for kk, vv in y.items():
if k == kk:
print("Match: {0}".format(kk))
break
elif k != kk:
print("No match: {0}".format(kk))
Current output
Match name with name
No Match address with name
Match address with address
No Match age with name
No Match age with address
Match age with age
No Match dateRegistered with name
No Match dateRegistered with address
No Match dateRegistered with age
Match dateRegistered with dateRegistered
Preferred output
Match name with name
Match address with address
Match age with age
* Due to a requirement change my list became a list of Elementtree xml elements *
So instead of the above list, its becomes
backup_list = ["<Element 'New' at 0x0000000002698C28>, <Element 'Update' at 0x0000000002698CC8>, <Element 'New' at 0x0000000002698CC8>"]
Where the ElementTree is an xml element containing:
{"name": "Nora", "address": "Home", "age": 33, "dateRegistered": 20140812}"
So this based on the answer below seems to satisfy my requirements so far:
value_to_compare = ["name", "address", "age"]
for i, elem in enumerate(current_list):
backup_dict = backup_list[i]
if elem.tag == "New":
for key in value_to_compare:
try:
print("Match {0} {1} == {2}:".format(key, backup_dict.attrib[key], elem.attrib[key]))
except KeyError:
print("key {} not found".format(key))
except:
raise
else:
continue

I don't know if I fully understood your question but I think the following code should do the trick:
# Fields that must agree between the two lists; "accesstime" is deliberately
# left out so it is ignored.
compare_arguments = ["name", "age", "address"]
# zip pairs the dictionaries position by position (first with first, ...).
for cl, bl in zip(current_list, backup_list):
    for ca in compare_arguments:
        if cl[ca] == bl[ca]:
            # Report the field name (e.g. "Match name with name"), not the
            # field's value, to match the output asked for in the question.
            print("Match {0} with {0}".format(ca))
    # Visual separator between one pair of dictionaries and the next.
    print("-" * 10)
What is done in the code above is a zip iteration over both lists. With another list you specify the fields you want to compare. In the main loop you iterate over the comparable fields and print them accordingly.

Someone has already made a module called deepdiff that does this and sooo much more! Refer to this answer for their detailed explanation!
First - install it
pip install deepdiff
Then - enjoy
# of course import it
from deepdiff import DeepDiff

current_list, backup_list = [...], [...]  # values stated in question.

for c, b in zip(current_list, backup_list):
    # 'values_changed' is only present when at least one value differs, so
    # fall back to an empty mapping when the two dictionaries agree.
    changed = DeepDiff(c, b).get('values_changed', {})
    for key in ["name", "age", "address"]:
        # DeepDiff reports a changed dictionary entry under the path
        # "root['<key>']" -- note the closing bracket.
        if f"root['{key}']" in changed:
            # drop the line below to exclude any non-matching values like your desired output has
            print(f"No Match {key} with {key}")
        else:
            print(f"Match {key} with {key}")
Results: - as expected
Match name with name
Match address with address
Match age with age
Match name with name
Match address with address
Match age with age
Match name with name
Match address with address
Match age with age
Final Note
This module has soo much else you can utilize such as type changes, key changes/removals/additions, an extensive text comparison, and searches as well. Definitely well worth a look into.
~GL on your project!

Simply compare with this-
for current in current_list:
for backup in backup_list:
for a in backup:
for b in current:
if a == b:
if a == "name" or a== "age" or a== "address" :
if backup[a] == current[b]:
print (backup[a])
print (current[b])

I do not understand the rationale of your data structure, but I think this will do the trick:
value_to_compare = ["name", "address", "age"]
for i, elem in enumerate(current_list):
backup_dict = backup_list[i]
for key in value_to_compare:
try:
print("Match {}: {} with {}".format(key, elem[key], backup_dict[key]))
except KeyError:
print("key {} not found".format(key))
# may be a raise here.
except:
raise

You can compare all corresponding fields with this code:
for dct1, dct2 in zip(current_list, backup_list):
for k, v in dct1.items():
if k == "accesstime":
continue
if v == dct2[k]:
print("Match: {0} with {0}".format(k))
else:
print("No match: {0} with {0}".format(k))
Note that the values of your "accesstime" keys are not valid Python objects!

If you are happy to use a 3rd party library, this kind of task can be more efficiently implemented, and in a more structured way, via Pandas:
import pandas as pd
res = pd.merge(pd.DataFrame(current_list),
pd.DataFrame(backup_list),
on=['name', 'address', 'age'],
how='outer',
indicator=True)
print(res)
accesstime_x address age name accesstime_y _merge
0 11:14:01 Home 23 Bill 13:34:24 both
1 11:57:43 Home 26 Fred 13:34:26 both
2 11:24:14 Home 33 Nora 13:35:14 both
The result _merge = 'both' for each row indicates the combination of ['name', 'address', 'age'] occurs in both lists but, in addition, you get to see the accesstime from each input.

You can use zip method to iterate over lists simultaneously.
elements_to_compare = ["name", "age", "address"]
for dic1, dic2 in zip(current_list, backup_list):
for element in elements_to_compare :
if dic1[element] == dic2[element]:
print("Match {0} with {0}".format(element))

Python - append to dictionary by name with multilevels 1, 1.1, 1.1.1, 1.1.2 (hierarchical)

I use openpyxl to read data from excel files to provide a json file at the end. The problem is that I cannot figure out an algorithm to do a hierarchical organisation of the json (or python dictionary).
The data form is like the following:
The output should be like this:
{
'id' : '1',
'name' : 'first',
'value' : 10,
'children': [ {
'id' : '1.1',
'name' : 'ab',
'value': 25,
'children' : [
{
'id' : '1.1.1',
'name' : 'abc' ,
'value': 16,
'children' : []
}
]
},
{
'id' : '1.2',
...
]
}
Here is what I have come up with, but i can't go beyond '1.1' because '1.1.1' and '1.1.1.1' and so on will be at the same level as 1.1.
from openpyxl import load_workbook
import re
from json import dumps
wb = load_workbook('resources.xlsx')
sheet = wb.get_sheet_by_name(wb.get_sheet_names()[0])
resources = {}
prev_dict = {}
list_rows = [ row for row in sheet.rows ]
for nrow in range(list_rows.__len__()):
id = str(list_rows[nrow][0].value)
val = {
'id' : id,
'name' : list_rows[nrow][1].value ,
'value' : list_rows[nrow][2].value ,
'children' : []
}
if id[:-2] == str(list_rows[nrow-1][0].value):
prev_dict['children'].append(val)
else:
resources[nrow] = val
prev_dict = resources[nrow]
print dumps(resources)

You need to access your data by ID, so first step is to create a dictionary where the IDs are the keys. For easier data manipulation, string "1.2.3" is converted to ("1","2","3") tuple. (Lists are not allowed as dict keys). This makes the computation of a parent key very easy (key[:-1]).
With this preparation, we could simply populate the children list of each item's parent. But before doing that a special ROOT element needs to be added. It is the parent of top-level items.
That's all. The code is below.
Note #1: It expects that every item has a parent. That's why 1.2.2 was added to the test data. If it is not the case, handle the KeyError where noted.
Note #2: The result is a list.
import json

testdata="""
1 first 20
1.1 ab 25
1.1.1 abc 16
1.2 cb 18
1.2.1 cbd 16
1.2.1.1 xyz 19
1.2.2 NEW -1
1.2.2.1 poz 40
1.2.2.2 pos 98
2 second 90
2.1 ezr 99
"""


def _to_number(text):
    """Return *text* as an int or float when it parses as one, else unchanged."""
    for convert in (int, float):
        try:
            return convert(text)
        except ValueError:
            continue
    return text


# One list of whitespace-separated fields per row; blank or whitespace-only
# lines are skipped.
datalist = [line.split() for line in testdata.splitlines() if line.strip()]

# Index every row by its id split into a tuple: "1.2.3" -> ("1", "2", "3").
# Tuples are hashable, and the parent's key is simply key[:-1].
datadict = {}
for fields in datalist:
    # First field is the id, last field is the value; whatever lies between
    # is the name, so a name containing spaces stays intact.
    item_id, *name_parts, raw_value = fields
    datadict[tuple(item_id.split('.'))] = {
        'id': item_id,
        'name': ' '.join(name_parts),
        'value': _to_number(raw_value),
        'children': [],
    }

# Artificial root element: the parent of all top-level items.
ROOT = ()
datadict[ROOT] = {'children': []}

for key, value in datadict.items():
    if key != ROOT:
        # KeyError = parent does not exist
        datadict[key[:-1]]['children'].append(value)

result = datadict[ROOT]['children']
print(json.dumps(result, indent=4))

Python how to compare string in a list with dict

I have a code like this:
It will print Student
d= u'pen hahahahaha'
area = [u'Apple',u'Banana',u'lemon']
area2 = [ u'pen',u'book',u'chair' ]
area3 = [u'father',u'mother']
if any(d.startswith(i) for i in area):
category = 'Fruit'
print 'Fruit'
elif any(d.startswith(i) for i in area2):
category = 'Student'
print 'Student'
elif any(d.startswith(i) for i in area3):
category = 'family'
print 'family'
I want to know how to edit it to a mode like this:
aa = [{"Fruit":[u'Apple',u'Banana',u'lemon']},
{"Student":[ u'pen',u'book',u'chair' ]},
{"Family":[u'father',u'mother']}]
So I can compare if 'pen hahahahaha' in {"Student":[ u'pen',u'book',u'chair' ]}
save category = 'Student'
I have thought about it for a while but have no idea. Please guide me. Thank you.

You can use loop:
categories = {
    "Fruit": [u'Apple', u'Banana', u'lemon'],
    "Student": [u'pen', u'book', u'chair'],
    "Family": [u'father', u'mother']
}


def get_category(value):
    """Return the first category with an entry that *value* starts with.

    Categories are checked in the order they appear in ``categories``.
    Returns None when no entry of any category is a prefix of *value*.
    """
    for cat, cat_entries in categories.items():
        # str.startswith accepts a tuple of prefixes, which replaces the
        # inner loop over the individual entries.
        if value.startswith(tuple(cat_entries)):
            return cat
    return None


print(get_category('pen hahahahaha'))
Output:
Student

Make aa a dictionary like:
aa = {"Fruit":[u'Apple',u'Banana',u'lemon'],
"Student":[ u'pen',u'book',u'chair' ],
"Family":[u'father',u'mother']}
obj = 'pen'
for key in aa:
if obj in aa[key]:
print(obj + ' is in ' + key)
edit:
May be this will suit your requirement more
aa = {"Fruit":[u'Apple',u'Banana',u'lemon'],
"Student":[ u'pen',u'book',u'chair' ],
"Family":[u'father',u'mother']}
obj = u'pen hahhah'
for key in aa:
for item in aa[key]:
if obj.startswith(item):
print(obj + ' is in ' + key)

aa = [{"Fruit":[u'Apple',u'Banana',u'lemon']},
      {"Student":[ u'pen',u'book',u'chair' ]},
      {"Family":[u'father',u'mother']}]
d=u'pen haaaaaa'

# Split the text once; an entry matches when it equals one whole word of d.
words = d.split()
# Each element of aa is a {category: entries} dict. Iterating .items() avoids
# indexing dict views, which Python 3 does not allow.
matches = [category
           for group in aa
           for category, entries in group.items()
           for entry in entries
           if entry in words]
print(matches)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Count and remove duplicates in keys while preserving values - python

Related

String content become random integer after using append()

How can I print hello and all female members

compare two lists of dictionaries for specific fields

Python - append to dictionary by name with multilevels 1, 1.1, 1.1.1, 1.1.2 (hierarchical)

Python how to compare string in a list with dict

Categories

Resources