I have data that looks like this:
The columns are Name, ID, Dev ID, Date
('Anthony', '1', '10', '4/3/2017')
('Anthony', '1', '11', '5/2/2017')
('Anthony', '1', '13', '12/30/2017')
('Anthony', '1', '15', '8/20/2017')
('Anthony', '4', '17', '2/3/2018')
('Anthony', '4', '18', '3/28/2017')
('Bob', '1', '111', '4/3/2017')
('Bob', '1', '200', '5/2/2017')
('Bob', '1', '113', '12/30/2017')
('Bob', '1', '115', '8/20/2017')
('Bob', '4', '117', '2/3/2018')
('Bob', '4', '118', '3/28/2017')
I'm trying to find the unique Name and ID combinations, then compare their dates and return only the one furthest in the future.
Ideally I want output that looks like:
('Anthony', '1', '12/30/2017')
('Anthony', '4', '2/3/2018')
('Bob', '1', '12/30/2017')
('Bob', '4', '2/3/2018')
I'm struggling because I have multiple keys and I can't figure out how to make it work. Any ideas?
Edit: This is only a sample; I have 30-ish names and 10 unique IDs, so I'm looking to write a for loop to figure this out.
You can use itertools.groupby combined with max to get output similar to what you're looking for.
import itertools
from datetime import datetime
data = [('Anthony', '1', '10', '4/3/2017'),
        ('Anthony', '1', '11', '5/2/2017'),
        ('Anthony', '1', '13', '12/30/2017'),
        ('Anthony', '1', '15', '8/20/2017'),
        ('Anthony', '4', '17', '2/3/2018'),
        ('Anthony', '4', '18', '3/28/2017'),
        ('Bob', '1', '111', '4/3/2017'),
        ('Bob', '1', '200', '5/2/2017'),
        ('Bob', '1', '113', '12/30/2017'),
        ('Bob', '1', '115', '8/20/2017'),
        ('Bob', '4', '117', '2/3/2018'),
        ('Bob', '4', '118', '3/28/2017')]
groups_with_max_date = []
for key, group in itertools.groupby(data, lambda d: (d[0], d[1])):
    # parse the date string and keep the row with the max date in each group
    group_max = max(group, key=lambda q: datetime.strptime(q[3], '%m/%d/%Y'))
    groups_with_max_date.append(group_max)
print(groups_with_max_date)
Gives us:
[('Anthony', '1', '13', '12/30/2017'),
('Anthony', '4', '17', '2/3/2018'),
('Bob', '1', '113', '12/30/2017'),
('Bob', '4', '117', '2/3/2018')]
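One caveat: itertools.groupby only groups consecutive items, so the code above relies on the data already being ordered by name and ID. If your rows might come in unsorted, a minimal sketch of the fix is to sort on the same key first:
from operator import itemgetter

# groupby only merges *consecutive* rows with equal keys, so sort on the key first
key_func = itemgetter(0, 1)
groups_with_max_date = []
for key, group in itertools.groupby(sorted(data, key=key_func), key_func):
    group_max = max(group, key=lambda q: datetime.strptime(q[3], '%m/%d/%Y'))
    groups_with_max_date.append(group_max)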
A solution using datetime objects with the dict.setdefault(), max and datetime.strptime functions:
import datetime
l = [('Anthony', '1', '10', '4/3/2017'), ('Anthony', '1', '11', '5/2/2017'),
     ('Anthony', '1', '13', '12/30/2017'), ('Anthony', '1', '15', '8/20/2017'),
     ('Anthony', '4', '17', '2/3/2018'), ('Anthony', '4', '18', '3/28/2017'),
     ('Bob', '1', '111', '4/3/2017'), ('Bob', '1', '200', '5/2/2017'),
     ('Bob', '1', '113', '12/30/2017'), ('Bob', '1', '115', '8/20/2017'),
     ('Bob', '4', '117', '2/3/2018'), ('Bob', '4', '118', '3/28/2017')]
d = {}
for t in l:
    # group items by the first two values of each tuple (accumulating date strings);
    # the two values are combined into a single "hash" key
    d.setdefault(t[0] + '-' + t[1], []).append(t[3])

# get the max date from each accumulated list
result = [(*k.split('-'), max(v, key=lambda dt: datetime.datetime.strptime(dt, '%m/%d/%Y')))
          for k, v in sorted(d.items())]
print(result)
The output:
[('Anthony', '1', '12/30/2017'), ('Anthony', '4', '2/3/2018'), ('Bob', '1', '12/30/2017'), ('Bob', '4', '2/3/2018')]
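Note that joining the name and ID with '-' breaks if a name itself ever contains a hyphen. A safer sketch of the same idea keys the dict on a tuple directly (reusing l from above):
d = {}
for name, id_, dev_id, date in l:
    # tuples are hashable, so (name, id_) works as a dict key directly
    d.setdefault((name, id_), []).append(date)

result = [(name, id_, max(dates, key=lambda dt: datetime.datetime.strptime(dt, '%m/%d/%Y')))
          for (name, id_), dates in sorted(d.items())]
print(result)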
I am getting some data from a spreadsheet using gspread, and it all goes into a list of lists. But I would like to get just the second, third and fourth value of each list separately.
Is there any way I can do this in Python 3?
The lists look like this:
['22-6-2020 15:54:53', '4', '5', '46'],
['22-6-2020 15:54:53', '5', '3', '67'],
['22-6-2020 15:54:53', '1', '7', '5'],
['22-6-2020 15:54:53', '3', '86', '67'],
['22-6-2020 15:54:53', '1', '26', '12']
You can do so using a list comprehension:
data = [['22-6-2020 15:54:53', '4', '5', '46'],
        ['22-6-2020 15:54:53', '5', '3', '67'],
        ['22-6-2020 15:54:53', '1', '7', '5'],
        ['22-6-2020 15:54:53', '3', '86', '67'],
        ['22-6-2020 15:54:53', '1', '26', '12']]
l1, l2, l3 = [[d[x] for d in data] for x in range(1,4)]
print(l1, l2, l3)
#prints
['4', '5', '1', '3', '1'] ['5', '3', '7', '86', '26'] ['46', '67', '5', '67', '12']
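An equivalent sketch, if you prefer unpacking: zip(*data) transposes the rows into columns, and the first column (the timestamp) can be discarded:
# transpose rows into columns; drop the timestamp column
_, l1, l2, l3 = (list(col) for col in zip(*data))
print(l1, l2, l3)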
After reading from a file I have a list of lists containing not only digits but also other characters, which I would like to get rid of.
I've tried using the re.sub function, but it doesn't seem to work:
import re

Poly_id = [['0', '[4', '8', '18', '20', '5', '0', '4]'],
           ['1', '[13', '16', '6', '11', '13]'],
           ['2', '[3', '1', '10', '9', '2', '15', '3]'],
           ['3', '[13', '12', '16', '13]'],
           ['4', '[13', '11', '17', '14', '7', '13]']]

for x in Poly_id:
    [re.sub(r'\W', '', ch) for ch in x]
This doesn't seem to change a thing in this list.
I would like to have a list with only numbers as elements so that I could convert them into integers
I guess technically '[4' is non-numeric, so if you want to drop such entries entirely you can do something like this:
Poly_id = [[char for char in _list if str.isnumeric(char)] for _list in Poly_id]
Output:
['0', '8', '18', '20', '5', '0']
['1', '16', '6', '11']
['2', '1', '10', '9', '2', '15']
['3', '12', '16']
['4', '11', '17', '14', '7']
If you just want to strip the non-numeric characters rather than drop the complete entry, you can do this:
Poly_id = [[''.join(char for char in substring if str.isnumeric(char)) for substring in _list] for _list in Poly_id]
Output:
['0', '4', '8', '18', '20', '5', '0', '4']
['1', '13', '16', '6', '11', '13']
['2', '3', '1', '10', '9', '2', '15', '3']
['3', '13', '12', '16', '13']
['4', '13', '11', '17', '14', '7', '13']
Here is a solution if you want to get rid of the '[' in '[4' but keep the '4':
res = [[re.sub(r'\W', '', st) for st in inlist] for inlist in Poly_id]
res is:
[
['0', '4', '8', '18', '20', '5', '0', '4'],
['1', '13', '16', '6', '11', '13'],
['2', '3', '1', '10', '9', '2', '15', '3'],
['3', '13', '12', '16', '13'],
['4', '13', '11', '17', '14', '7', '13']
]
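Since the stated goal is to end up with integers, a final step on top of res could be:
# convert the cleaned strings to integers
numbers = [[int(st) for st in inlist] for inlist in res]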
You can use the itertools module:
import itertools
list_of_lists = [[1, 2], [3, 4]]
print(list(itertools.chain(*list_of_lists)))
>>>[1, 2, 3, 4]
I am using tflearn, and I load my CSV file with the following code...
data, labels = load_csv('/home/eric/Documents/Speed Dating Data.csv',
                        target_column=0, categorical_labels=False)
Here is a snippet of my csv file (there are a lot more columns)...
I want to remove a specific column. For example, let's say I remove column 1 and then print out the data for columns 1 to 5...
def preprocess(cols_del):
    data, labels = load_csv('/home/eric/Documents/Speed Dating Data.csv',
                            target_column=0, categorical_labels=False)
    for col_del in sorted(cols_del):
        [data.pop(col_del) for position in data]
    for i in range(20):
        print(data[i][0:5])

def main(_):
    delete = [0]
    preprocess(delete)
This is the result...
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['9', '1', '18', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
['10', '1', '20', '2', '11']
The data is clearly different. What is going on? Are rows being deleted instead of columns? How can I delete an entire column without altering any of the others?
Also, I know it is kind of a separate question, but if I were to use n_classes in my load_csv function, how would I do that? Is that the number of columns in my CSV?
What's happening is that the line [data.pop(col_del) for position in data] is deleting half your rows, and then you're displaying the first 20 rows of what's left. (It would delete all the rows, except that each pop shifts the list under the advancing loop iterator, so the loop ends after removing about half.)
If you don't want certain columns you should pass your delete list to the columns_to_ignore parameter when you call load_csv. See the function description at load_csv. If you need to remove columns from a dataset in memory I think it would be worth your time to learn the basics of the Pandas library; it will make your life much simpler.
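A minimal sketch of that call, reusing your path (assuming here that [1] is the column you were trying to pop; check the docs for how columns_to_ignore indices interact with target_column):
from tflearn.data_utils import load_csv

# let load_csv drop the unwanted column instead of popping rows by hand
data, labels = load_csv('/home/eric/Documents/Speed Dating Data.csv',
                        target_column=0, columns_to_ignore=[1],
                        categorical_labels=False)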
You would need n_classes if your target labels were categorical, in order to tell load_csv how many categories there are. Since you have categorical_labels=False, you shouldn't need it.
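For example, a sketch with categorical labels (the class count of 2 is just an assumed placeholder):
# n_classes is only needed when categorical_labels=True
data, labels = load_csv('/home/eric/Documents/Speed Dating Data.csv',
                        target_column=0, categorical_labels=True, n_classes=2)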
I have 2 lists whose elements I combined into a dictionary. Both lists have a size of 20, but the resulting dictionary almost always has only 13-15 entries. The code is:
import random

nodes = []
for x in range(1, 21):
    nodes.append(str(random.randint(1, 20)))
print(nodes)
Output:
['3', '6', '10', '12', '12', '10', '11', '17', '6', '19', '20', '19', '7', '16', '9', '13', '15', '9', '12', '5']
The rest of the code:
lines=[]
fp = open("work.txt") # open file on read mode
lines = fp.read().split("\n") # create a list containing all lines
fp.close() # close file
print(lines)
Output:
['5', '1', '7', '1', '1', '3', '12', '1', '1', '8', '7', '5', '12', '5', '5', '3', '7', '7', '13', '1', '']
To make a dictionary:
dictionary = dict(zip(nodes,lines))
print(dictionary)
{'3': '1', '6': '5', '10': '12', '12': '13', '10': '12', '11': '7', '17': '7', '6': '5', '19': '3', '20': '1', '19': '1', '17': '3', '16': '5', '9': '7'}
As you can see, the size shrank to 14 when zipped. Do you know the reason, and how can I fix it?
NOTE: I have to keep all the keys. The duplicates in the lists are on purpose.
NOTE2: The output should have the format above, the pairs of two numbers. (1,3) or (1:3) but not (1:3,4).
For a node, line dictionary, you need unique keys. When you build a dict from non-unique keys, later entries overwrite earlier ones.
If you really want non-unique indexes, just store your data in a list of tuples. You won't get quick access but that will work:
tuples = list(zip(nodes,lines))
Now if you want to gather the lines under the same node index, you can create a dictionary of lists of lines using for instance defaultdict:
import collections

d = collections.defaultdict(list)
for node, text in zip(nodes, lines):
    d[node].append(text)
so for each unique node, you get a list of associated lines.
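For the sample nodes and lines above, the grouped dict would look something like this (keys in first-seen order, duplicates kept as list entries):
{'3': ['5'], '6': ['1', '1'], '10': ['7', '3'], '12': ['1', '1', '13'], ...}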
Because you can't have duplicate keys in a dictionary:
>>> nodes = ['3', '6', '10', '12', '12', '10', '11', '17', '6', '19', '20', '19', '7', '16', '9', '13', '15', '9', '12', '5']
>>> lines = ['5', '1', '7', '1', '1', '3', '12', '1', '1', '8', '7', '5', '12', '5', '5', '3', '7', '7', '13', '1', '']
>>> dictionary = dict(zip(nodes,lines))
>>> len(dictionary.keys())
14
There are only 14 unique elements in nodes, and those are your dictionary keys:
>>> len(set(nodes))
14
Your "key" list has duplicate items. Dictionaries require unique keys, previously entered keys are overwritten.
You can group items in a list as follows
result = {}
for key, item in zip(nodes, lines):
    if key in result:
        result[key].append(item)
    else:
        result[key] = [item]
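The if/else can also be collapsed with dict.setdefault, as a sketch:
result = {}
for key, item in zip(nodes, lines):
    # setdefault inserts an empty list the first time a key is seen
    result.setdefault(key, []).append(item)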
The problem is that you don't want a dictionary to begin with; you want a list of tuples. Your code should instead be:
import random

nodes = []
for x in range(1, 21):
    nodes.append(str(random.randint(1, 20)))

fp = open("work.txt")
lines = fp.read().split("\n")
fp.close()

zipped = list(zip(nodes, lines))
This will give you a list of tuples, the first element being the random number and the second the number read from the file, and it will not eliminate duplicates. It will print in the format [(3, 5), (6, 1), ...] rather than dict syntax, but it should accomplish what I think you originally intended.
Illinois: ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3']
Indiana: ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']
Those are in my dictionary d.
How would I get the largest and smallest value for each key in the dictionary, along with the index where each value is?
For example:
In Illinois, 26 is the largest value, which is index 5, and 3 is the smallest value, which is index 15.
In Indiana, 13 is the largest value, which is index 7, and 2 is the smallest value, which is index 14.
The output:
Illinois: 26 in index 5 and 3 in index 15
Indiana: 13 in index 7 and 2 in index 14
How would I do this?
d = {}
for row in csv_f:
    d[row[0]] = row[1:]
You can get the max and min printed out as strings like this
(assuming you only want the first occurrence)
MY_D = {'Illinois': ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'],
'Indiana': ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']}
for k, v in MY_D.items():
    # this assumes that everything in v is an int, or rather can be converted to one
    my_l = [int(n) for n in v]
    # if not:
    # my_l = [int(n) for n in v if n.isdigit()]
    _max, _min = max(my_l), min(my_l)
    print("%s: Min - %d in index %d, Max - %d in index %d" % (k, _min, my_l.index(_min), _max, my_l.index(_max)))
Here is a solution returning a dict of {state: ((index, maxval), (index, minval))}:
d = {
'Illinois': ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'],
'Indiana': ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']
}
maxmin = {}
for state, numbers in d.items():
    maxmin[state] = (
        max(enumerate(numbers), key=lambda x: int(x[1])),
        min(enumerate(numbers), key=lambda x: int(x[1]))
    )
print(maxmin)
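For the sample dict above, this should print (max pair first, each as (index, value)):
{'Illinois': ((4, '26'), (14, '3')), 'Indiana': ((6, '13'), (13, '2'))}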
Bit thrown together, but seems to do the job.
d = {"Illinois": ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'],
"Indiana": ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']}
if __name__ == "__main__":
print d
for state in d:
# returns the numbers with their index (#, index)
pairs = [(int(d[state][x]), x) for x in xrange(len(d[state]))]
minpair = min(pairs)
maxpair = max(pairs)
print "%s: %d in index %d and %d in index %d"%(state,maxpair[0],maxpair[1],
minpair[0],minpair[1])
Output:
{'Illinois': ['13', '12', '18', '23', '26', '25', '24', '19', '13', '10', '15', '14', '14', '4', '3'], 'Indiana': ['7', '6', '7', '8', '11', '11', '13', '12', '7', '7', '7', '7', '9', '2', '2']}
Illinois: 26 in index 4 and 3 in index 14
Indiana: 13 in index 6 and 2 in index 13
To get around a blank string, you could break up the list comprehension into:
pairs = []
for x in range(len(d[state])):
    try:
        pairs.append((int(d[state][x]), x))
    except ValueError:
        pass  # not a valid number