Creating a dictionary from a text file in Python

I have to create a dictionary based on a csv file that looks like this:
'song, 2000, 184950'
'boom, 2009, 83729'
'boom, 2010, 284500'
'boom, 2011, 203889'
'pow, 2000, 385920'
'pow, 2001, 248930'
From this, I have to create a dictionary with the word as the key and a list of class objects as the value.
This is what I have so far...
class Counter():
    __slots__ = ('year', 'count')
    _types = (int, int)

def readfile(file):
    d = dict()
    with open(file) as f:
        for line in f:
            element = line.split(',')
            for word in element:
                if word in d:
                    d[word].append([Count(int(element[1]), int(element[2]))])
                else:
                    d[word] = [Count(int(element[1]), int(element[2]))]
    print(d)
The output I'm getting is weird: it gives me a dictionary similar to what mine should look like, but it uses the counts (e.g. 183930) as the keys instead of the names. I also need it to append the Count object to the value if the word is already listed in the dictionary.
For example, since 'boom' will already be in the dictionary as {'boom': [Count(year=2009, count=83729)]}, I want the later Count objects added to the list under that one key.
expected output:
{'song' : [Count(year= 2000, count= 184950)], 'boom' : [Count(year=2009, count=83729),
Count(year=2010, count= 284500), Count(year=2011, count=203889)], 'pow' : ...etc..}

With this loop:
for word in element:
    if word in d:
        d[word].append([Count(int(element[1]), int(element[2]))])
    else:
        d[word] = [Count(int(element[1]), int(element[2]))]
you are iterating through all the words on the line, so for the first line you call:
d['song'].append([Count(int('2000'), int('184950'))])
d['2000'].append([Count(int('2000'), int('184950'))])
d['184950'].append([Count(int('2000'), int('184950'))])
Just use:
for line in f:
    element = line.split(',')
    word = element[0]
    if word in d:
        d[word].append(Count(int(element[1]), int(element[2])))
    else:
        d[word] = [Count(int(element[1]), int(element[2]))]
You can also drop the if word in d check by using collections.defaultdict:
import collections

def readfile(file):
    d = collections.defaultdict(list)
    with open(file) as f:
        for line in f:
            element = line.split(',')
            word = element[0]
            d[word].append(Count(int(element[1]), int(element[2])))
    print(d)
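
Both snippets assume the Count class from the question; since only its slots are shown there, here is a minimal sketch of what it might look like (the constructor and repr are assumptions based on the expected output):

class Count:
    __slots__ = ('year', 'count')

    def __init__(self, year, count):
        self.year = year
        self.count = count

    def __repr__(self):
        # matches the Count(year=..., count=...) form in the expected output
        return 'Count(year=%d, count=%d)' % (self.year, self.count)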

Here's some simple code that does what you're looking for.
from collections import defaultdict
from pprint import pprint

class Counter(object):
    __slots__ = ('year', 'count')

    def __init__(self, year, count):
        self.year = year
        self.count = count

    def __str__(self):
        return "Counter(year=%d, count=%d)" % (self.year, self.count)

    def __repr__(self):
        return self.__str__()

def import_counts():
    counts = defaultdict(list)
    with open('data.csv') as f:
        for line in f:
            name, year, count = line.split(',')
            name, year, count = name.strip(), int(year.strip()), int(count.strip())
            counts[name].append(Counter(year, count))
    return counts

pprint(import_counts())
However, I changed the data format into a proper CSV as below.
song, 2000, 184950
boom, 2009, 83729
boom, 2010, 284500
boom, 2011, 203889
pow, 2000, 385920
pow, 2001, 248930
The output generated is as below:
{
    'boom': [
        Counter(year=2009, count=83729),
        Counter(year=2010, count=284500),
        Counter(year=2011, count=203889)
    ],
    'pow': [
        Counter(year=2000, count=385920),
        Counter(year=2001, count=248930)
    ],
    'song': [
        Counter(year=2000, count=184950)
    ]
}
Do note that the above doesn't validate input and would error if given an invalid CSV.
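If you do want some validation, here is a hedged sketch of one way to skip malformed rows, reusing the defaultdict import and Counter class from the snippet above (the exact policy for bad rows is an assumption):

def import_counts_safe(path='data.csv'):
    counts = defaultdict(list)
    with open(path) as f:
        for lineno, line in enumerate(f, 1):
            parts = [p.strip() for p in line.split(',')]
            if len(parts) != 3:
                print('Skipping row %d: expected 3 fields' % lineno)
                continue
            name, year, count = parts
            try:
                counts[name].append(Counter(int(year), int(count)))
            except ValueError:
                print('Skipping row %d: non-numeric year/count' % lineno)
    return counts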

Related

How to get rid of the rest of the text after getting the results I want?

import urllib.request
import json
from collections import Counter

def count_coauthors(author_id):
    coauthors_dict = {}
    url_str = ('https://api.semanticscholar.org/graph/v1/author/47490276?fields=name,papers.authors')
    respons = urllib.request.urlopen(url_str)
    text = respons.read().decode()
    for line in respons:
        print(line.decode().rstrip())
    data = json.loads(text)
    print(type(data))
    print(list(data.keys()))
    print(data["name"])
    print(data["authorId"])
    name = []
    for lines in data["papers"]:
        for authors in lines["authors"]:
            name.append(authors.get("name"))
    print(name)
    count = dict()
    names = name
    for i in names:
        if i not in count:
            count[i] = 1
        else:
            count[i] += 1
    print(count)
    c = Counter(count)
    top = c.most_common(10)
    print(top)
    return coauthors_dict

author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc.items(), key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)
This is how my code looks so far; there are no errors. I need to get rid of the rest of the text when I run it, so the output should look like this:
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
I have tried using rstrip and split on my 'c' variable but it doesn't work. I'm only allowed to import what I have already imported, and I must use the link that is included.
Tips on simplifying or improving the code are also appreciated!
("Extend the program below so that it prints the names of the top-10 coauthors together with the numbers of the coauthored publications")
From what I understand you are not quite sure where your successful output originates from. It is not the 5 lines at the end.
Your result is printed by the print(top) on line 39. This top variable is what you want to return from the function, as the coauthors_dict you are currently returning never actually gets any data written to it.
You will also have to slightly adjust your sorted(...) as you now have a list and not a dictionary, but you should then get the correct result.
If I understand correctly, you want this function to return a count of each distinct co-author (excluding the author themselves), which it seems like you already have in your count variable, but you don't return it. The variable you DO return is empty.
Instead consider:
import urllib.request
import json
from collections import Counter

def count_coauthors(author_id):
    url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
    response = urllib.request.urlopen(url_str)
    text = response.read().decode()
    data = json.loads(text)
    names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
    # The statement above can be written long-hand like:
    # names = []
    # for l in data["papers"]:
    #     for a in l["authors"]:
    #         if a['authorId'] != author_id:
    #             names.append(a.get("name"))
    return list(Counter(names).items())

author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc, key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
You might also consider moving the top-N logic into the function as an optional parameter:
import urllib.request
import json
from collections import Counter

def count_coauthors(author_id, top=0):
    url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
    response = urllib.request.urlopen(url_str)
    text = response.read().decode()
    data = json.loads(text)
    names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
    name_count = list(Counter(names).items())
    top = top if top != 0 else len(name_count)
    return sorted(name_count, key=lambda x: x[1], reverse=True)[:top]

author_id = '47490276'
for auth in count_coauthors(author_id, top=10):
    print(auth)

How to find all longest common substrings that exist in multiple documents?

I have many text documents that I want to compare to one another and remove all text that is exactly the same between them. The goal is to find boilerplate text that is consistent across documents so it can be removed before NLP.
The best way I figured to do this is to find the longest common substrings that exist, or are mostly present, in all the documents. However, doing this has been incredibly slow.
Here is an example of what I am trying to accomplish:
DocA:
Title: To Kill a Mocking Bird
Author: Harper Lee
Published: July 11, 1960
DocB:
Title: 1984
Author: George Orwell
Published: June 1949
DocC:
Title: The Great Gatsby
Author: F. Scott Fitzgerald
The output would show something like:
{
'Title': 3,
'Author': 3,
'Published': 2,
}
The results would then be used to strip out the commonalities between documents.
Here is some code I have tested in Python. It's incredibly slow with any significant number of permutations:
import itertools
from difflib import SequenceMatcher
import pandas as pd

file_perms = list(itertools.permutations(files, 2))
results = {}
for p in file_perms:
    doc_a = p[0]
    doc_b = p[1]
    while True:
        seq_match = SequenceMatcher(a=doc_a, b=doc_b)
        match = seq_match.find_longest_match(0, len(doc_a), 0, len(doc_b))
        if (match.size >= 5):
            doc_a_start, doc_a_stop = match.a, match.a + match.size
            doc_b_start, doc_b_stop = match.b, match.b + match.size
            match_word = doc_a[doc_a_start:doc_a_stop]
            if match_word in results:
                results[match_word] += 1
            else:
                results[match_word] = 1
            doc_a = doc_a[:doc_a_start] + doc_a[doc_a_stop:]
            doc_b = doc_b[:doc_b_start] + doc_b[doc_b_stop:]
        else:
            break

df = pd.DataFrame(
    {
        'Value': [x for x in results.keys()],
        'Count': [x for x in results.values()]
    }
)
print(df)
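
(As an aside, not from the original post: the matching above treats the two documents symmetrically, so iterating over permutations compares every pair twice; itertools.combinations would roughly halve the work. A minimal sketch:)

import itertools

# combinations yields each unordered pair once: (a, b) but never (b, a)
for doc_a, doc_b in itertools.combinations(files, 2):
    pass  # run the SequenceMatcher loop above on this pair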
Create a set from each document, then build a counter of how many documents each word appears in. Next, iterate over every document; when you find a word that appears in 70%-90% of the documents, append it and the word after it as a tuple to a new counter. And again...
from collections import Counter

one_word = Counter()
for doc in docs:
    word_list = doc.split(" ")
    word_set = set(word_list)
    for word in word_set:
        one_word[word] += 1

two_word = Counter()
threshold = len(docs) * 0.7
for doc in docs:
    word_list = doc.split(" ")
    for i in range(len(word_list) - 1):
        if one_word[word_list[i]] > threshold:
            key = (word_list[i], word_list[i + 1])
            two_word[key] += 1
You can play with the threshold and continue as long as the counter is not empty.
The docs here are the lyrics of the songs 'Believer', 'By the River of Babylon', 'I Could Stay Awake', and 'Rattlin' Bog':
from collections import Counter
import os
import glob

TR = 1  # threshold
dir = r"D:\docs"
path = os.path.join(dir, "*.txt")
files = glob.glob(path)
one_word = {}
all_docs = {}
for file in files:
    one_word[file] = set()
    all_docs[file] = []
    with open(file) as doc:
        for row in doc:
            for word in row.split():
                one_word[file].add(word)
                all_docs[file].append(word)
# now one_word is a dict where the key is the file name and the value is the set of words in it
# all_docs is a dict where the file name is the key and the value is the complete doc stored word by word in a list

common_Frase = Counter()
for key in one_word:
    for word in one_word[key]:
        common_Frase[word] += 1
# common_Frase contains a count of each word's appearances across all files (every file can add a word once)

two_word = {}
for key in all_docs:
    two_word[key] = set()
    doc = all_docs[key]
    for index in range(len(doc) - 1):
        if common_Frase[doc[index]] > TR:
            val = (doc[index], doc[index + 1])
            two_word[key].add(val)
for key in two_word:
    for word in two_word[key]:
        common_Frase[word] += 1
# now common_Frase also contains a count of all two-word phrases

three_word = {}
for key in all_docs:
    three_word[key] = set()
    doc = all_docs[key]
    for index in range(len(doc) - 2):
        val2 = (doc[index], doc[index + 1])
        if common_Frase[val2] > TR:
            val3 = (doc[index], doc[index + 1], doc[index + 2])
            three_word[key].add(val3)
for key in three_word:
    for word in three_word[key]:
        common_Frase[word] += 1

for k in common_Frase:
    if common_Frase[k] > 1:
        print(k)
This is the output:
when like all Don't And one the my hear and feeling Then your of I'm in me The you away I never to be what a ever thing there from By down Now words that was ('all', 'the') ('And', 'the') ('the', 'words') ('By', 'the') ('and', 'the') ('in', 'the')
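
The "and again..." step can be generalized: keep extending frequent n-grams into (n+1)-grams while anything survives the threshold. A hedged sketch of that loop, not from the original answer, reusing the all_docs and TR variables from the code above:

from collections import Counter

def grow_phrases(all_docs, TR):
    # Count 1-grams: each file contributes each word at most once.
    counts = Counter()
    for words in all_docs.values():
        for w in set(words):
            counts[w] += 1
    n = 2
    while True:
        level = Counter()
        for words in all_docs.values():
            grams = set()
            for i in range(len(words) - n + 1):
                # the (n-1)-gram prefix must already be frequent
                prefix = words[i] if n == 2 else tuple(words[i:i + n - 1])
                if counts[prefix] > TR:
                    grams.add(tuple(words[i:i + n]))
            for g in grams:
                level[g] += 1
        if not level:
            break  # nothing left to extend
        counts.update(level)
        n += 1
    return counts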

'itertools._grouper' object has no attribute 'user'

Why can't I convert the group from groupby to a list? Currently I am working with Django==2.2.1, and when I try the dummy data below in the Python console, it works fine.
from itertools import groupby
from operator import itemgetter
from django.contrib.auth.decorators import login_required

@login_required
def list(request, template_name='cart/list.html'):
    # I also tried with this dummy data
    test_data = [{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':7.0}]
    print(type(test_data))  # a list
    sorted_totals = sorted(test_data, key=itemgetter('total_order'))
    for agent_name, group in groupby(sorted_totals, key=lambda x: x['agent_name']):
        print(agent_name, list(group))  # I stopped here when converting the `group` to a list
But I am getting an error like this when I try it in a Django view.
I also tried it with defaultdict:
from collections import defaultdict
from django.contrib.auth.decorators import login_required

@login_required
def list(request, template_name='cart/list.html'):
    test_data = [{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':7.0}]
    grouped = defaultdict(list)
    for data_total in test_data:
        grouped[data_total['agent_name']].append(data_total)  # stopped here

    grouped_out = []
    for agent_name, group in grouped.items():
        total_order = 0
        total_pcs = 0
        total_kg = 0
        if isinstance(group, list):
            for data_total in group:
                total_order += data_total.get('total_order')
                total_pcs += data_total.get('total_pcs')
                total_kg += data_total.get('total_kg')
        grouped_out.append({
            'agent_name': agent_name,
            'total_order': total_order,
            'total_pcs': total_pcs,
            'total_kg': total_kg
        })
But this stopped with the same error, raised from the wrapper view. Following the previous issue, it is referenced from this _wrapped_view.
Finally, I fixed it manually by using a dict.
test_data = [{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':5.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agentbeli','total_pcs':1,'total_kg':6.0},{'total_order':1,'agent_name':'agent123','total_pcs':1,'total_kg':7.0}]
grouped = {}
for data_total in test_data:
    agent_name = data_total.get('agent_name')
    if agent_name in grouped:
        new_data = grouped[agent_name]  # dict
        new_data['total_order'] += data_total.get('total_order')
        new_data['total_pcs'] += data_total.get('total_pcs')
        new_data['total_kg'] += data_total.get('total_kg')
        grouped[agent_name].update(**new_data)
    else:
        grouped[agent_name] = data_total
And the result of grouped looks like this:
{'agent123': {'agent_name': 'agent123',
'total_kg': 18.0,
'total_order': 3,
'total_pcs': 3},
'agentbeli': {'agent_name': 'agentbeli',
'total_kg': 17.0,
'total_order': 3,
'total_pcs': 3}}
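
A likely root cause, as an addition not from the original post: naming the view list shadows the builtin list at module level, so inside the view the call list(group) actually calls the decorated view itself with group as request, and @login_required then fails when it reads request.user; that matches the 'itertools._grouper' object has no attribute 'user' error exactly. Renaming the view restores the builtin, as in this minimal sketch:

from itertools import groupby

def cart_list(test_data):  # renamed from `list`, so the builtin is not shadowed
    sorted_totals = sorted(test_data, key=lambda x: x['agent_name'])
    for agent_name, group in groupby(sorted_totals, key=lambda x: x['agent_name']):
        print(agent_name, list(group))  # `list` is the builtin again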

How can I count different values per same key with Python?

I have code which gives me a list like this:

Name     id number    week number
Piata    4            6
Mali     2            20,5
Goerge   5            4
Gooki    3            24,64,6
Mali     5            45,9
Piata    6            1
Piata    12           2,7,8,27,16    etc..
with the below code:
import csv
import datetime
from datetime import date
from collections import defaultdict

datedict = defaultdict(set)
with open('d:/info.csv', 'r') as csvfile:
    filereader = csv.reader(csvfile, 'excel')
    # passing the header
    read_header = False
    start_date = date(year=2009, month=1, day=1)
    #print((seen_date - start_date).days)
    tdic = {}
    for row in filereader:
        if not read_header:
            read_header = True
            continue
        # reading the rest of the rows
        name, id, firstseen = row[0], row[1], row[3]
        try:
            seen_date = datetime.datetime.strptime(firstseen, '%d/%m/%Y').date()
            deltadays = (seen_date - start_date).days
            deltaweeks = deltadays / 7 + 1
            key = name, id
            currentvalue = tdic.get(key, set())
            currentvalue.add(deltaweeks)
            tdic[key] = currentvalue
        except ValueError:
            print('Date value error')
            pass
Right now I want to convert my list into one that gives me the number of ids for each name together with its week numbers, like the list below:

Name     number of ids    weeknumbers
Mali     2                20,5,45,9
Piata    3                1,6,2,7,8,27,16
Goerge   1                4
Gooki    1                24,64,6
Can anyone help me with writing the code for this part?
Since it looks like your csv file has headers (which you are currently ignoring) why not use a DictReader instead of the standard reader class? If you don't supply fieldnames the DictReader will assume the first line contains them, which will also save you from having to skip the first line in your loop.
This seems like a great opportunity to use defaultdict and Counter from the collections module.
import csv
import datetime
from datetime import date
from collections import defaultdict, Counter

datedict = defaultdict(set)
namecounter = Counter()
with open('d:/info.csv', 'r') as csvfile:
    filereader = csv.DictReader(csvfile)
    start_date = date(year=2009, month=1, day=1)
    for row in filereader:
        name, id, firstseen = row['name'], row['id'], row['firstseen']
        try:
            seen_date = datetime.datetime.strptime(firstseen, '%d/%m/%Y').date()
        except ValueError:
            print('Date value error')
            continue  # skip this row; seen_date would otherwise be undefined below
        deltadays = (seen_date - start_date).days
        deltaweeks = deltadays / 7 + 1
        datedict[name].add(deltaweeks)
        namecounter.update([name])  # without putting name into a list, update would count each character
This assumes that (name, id) is unique. If this is not the case then you can use another defaultdict for namecounter. I've also moved the try-except statement so it is more explicit about what you are testing.
Given that:
tdict = {('Mali', 5): set([9, 45]), ('Gooki', 3): set([24, 64, 6]), ('Goerge', 5): set([4]), ('Mali', 2): set([20, 5]), ('Piata', 4): set([4]), ('Piata', 6): set([1]), ('Piata', 12): set([8, 16, 2, 27, 7])}
then to output the result above:
names = {}
for ((name, id), more_weeks) in tdict.items():
    (ids, weeks) = names.get(name, (0, set()))
    ids = ids + 1
    weeks = weeks.union(more_weeks)
    names[name] = (ids, weeks)

for (name, (id, weeks)) in names.items():
    print("%s, %s, %s" % (name, id, weeks))

load parameters from a file in Python

I am writing a Python class to model a process and I want to initialize the parameters from a file, say 'input.dat'. The format of the input file looks like this.
'input.dat' file:
Z0: 0 0
k: 0.1
g: 1
Delta: 20
t_end: 300
The code I wrote is the following. It works but appears redundant and inflexible. Is there a better way to do the job? Such as a loop to do readline() and then match the keyword?
def load(self, filename="input.dat"):
    FILE = open(filename)
    s = FILE.readline().split()
    if len(s) == 3:
        self.z0 = [float(s[1]), float(s[2])]  # initial state
    s = FILE.readline().split()
    if len(s) == 2:
        self.k = float(s[1])  # kappa
    s = FILE.readline().split()
    if len(s) == 2:
        self.g = float(s[1])
    s = FILE.readline().split()
    if len(s) == 2:
        self.D = float(s[1])  # Delta
    s = FILE.readline().split()
    if len(s) == 2:
        self.T = float(s[1])  # end time
Assuming the params are coming from a safe place (made by you or users, not the internet), just make the parameters file a Python file, params.py:
Z0 = (0, 0)
k = 0.1
g = 1
Delta = 20
t_end = 300
Then in your code all you need is:
import params
fancy_calculation(10, k=params.k, delta=params.Delta)
The beauty of this is two-fold: 1) simplicity, and 2) you can use the power of Python in your parameter descriptions -- particularly useful here, for example:
k = 0.1
Delta = 20
g = 3 * k + Delta
Alternatively, you could use Python's built-in JSON or ConfigParser .INI parser modules.
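For example, a minimal sketch using the built-in json module (the params.json file name and contents are just for illustration):

import json

# params.json might contain:
# {"Z0": [0, 0], "k": 0.1, "g": 1, "Delta": 20, "t_end": 300}
with open('params.json') as f:
    params = json.load(f)

print(params['k'], params['Delta'])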
If you are open to some other kind of file where you can keep your parameters, I would suggest you to use a YAML file.
The Python library is PyYAML. This is how you can easily use it with Python.
For a better introduction, look at this Wikipedia article: http://en.wikipedia.org/wiki/YAML.
The benefit is you can read the parameter values as lists or maps.
You would love it!
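A minimal sketch, assuming PyYAML is installed and the parameters live in a file named params.yaml:

import yaml  # PyYAML

# params.yaml might contain:
# Z0: [0, 0]
# k: 0.1
# g: 1
# Delta: 20
# t_end: 300
with open('params.yaml') as f:
    params = yaml.safe_load(f)

print(params['Z0'], params['t_end'])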
Try the following:
def load(self, filename="input.dat"):
    d = {"Z0": "z0", "k": "k", "g": "g", "Delta": "D", "t_end": "T"}
    FILE = open(filename)
    for line in FILE:
        name, value = line.split(":")
        value = value.strip()
        if " " in value:
            value = list(map(float, value.split()))
        else:
            value = float(value)
        setattr(self, d[name], value)
Proof that it works:
>>> class A(object): pass
...
>>> a = A()
>>> load(a)
>>> a.__dict__
{'k': 0.10000000000000001, 'z0': [0.0, 0.0], 'D': 20.0, 'g': 1.0, 'T': 300.0}
As others have mentioned, in Python you can create object attributes dynamically "on the fly". That means you could do something like the following to create Params objects as they're read in. I've tried to make the code as data-driven as possible, so it's relatively flexible.
# maps label to attribute name and types
label_attr_map = {
    "Z0:": ["z0", float, float],
    "k:": ["k", float],
    "g:": ["g", float],
    "Delta:": ["D", float],
    "t_end:": ["T", float]
}

class Params(object):
    def __init__(self, input_file_name):
        with open(input_file_name, 'r') as input_file:
            for line in input_file:
                row = line.split()
                label = row[0]
                data = row[1:]  # rest of row is the data list
                attr = label_attr_map[label][0]
                datatypes = label_attr_map[label][1:]
                values = [(datatypes[i](data[i])) for i in range(len(data))]
                self.__dict__[attr] = values if len(values) > 1 else values[0]

params = Params('input.dat')
print('params.z0:', params.z0)
print('params.k:', params.k)
print('params.g:', params.g)
print('params.D:', params.D)
print('params.T:', params.T)
Output:
params.z0: [0.0, 0.0]
params.k: 0.1
params.g: 1.0
params.D: 20.0
params.T: 300.0
Perhaps this might give you what you need:
def load(self, filename='input.dat'):
    with open(filename) as fh:
        for line in fh:
            s = line.split()
            name = s[0].rstrip(':')  # the label, with the trailing colon stripped
            if len(s) == 2:
                setattr(self, name, s[1])
            elif len(s) == 3:
                setattr(self, name, s[1:])
I also didn't include any error checking, but setattr is very handy.
Something like this:
def load(self, filename="input.dat"):
    # maps names to the number of fields they need
    # only necessary for variables with more than 1 field
    argmap = dict(Z0=2)
    # maps config-file names to their attribute names on the object
    # if the name is the same in both places, no entry is needed
    namemap = dict(Z0="z0", Delta="D", t_end="T")
    with open(filename) as FILE:
        for line in FILE:
            s = line.split()
            var = s[0].rstrip(":")
            try:
                val = [float(x) for x in s[1:]]
            except ValueError:
                continue
            if len(val) == argmap.get(var, 1):
                if len(val) == 1:
                    val = val[0]
                setattr(self, namemap.get(var, var), val)
Python objects have a built-in __dict__ member. You can modify it, and then refer to properties as obj.key.
class Data(object):
    def __init__(self, path='infile.dat'):
        with open(path, 'r') as fo:
            for line in fo.readlines():
                if len(line) < 2:
                    continue
                parts = [s.strip(' :\n') for s in line.split(' ', 1)]
                numbers = [float(s) for s in parts[1].split()]
                # This is optional... do you want single values to be stored in lists?
                if len(numbers) == 1:
                    numbers = numbers[0]
                self.__dict__[parts[0]] = numbers
                # print(parts)  # debug

obj = Data('infile.dat')
print(obj.g)
print(obj.Delta)
print(obj.Z0)
At the end of this, we print out a few of the keys. Here's the output of those.
1.0
20.0
[0.0, 0.0]
For consistency, you can remove the line marked "optional" in my code, and have all objects in lists -- regardless of how many elements they have. That will make using them quite a bit easier, because you never have to worry about obj.g[0] returning an error.
Here's another one
def splitstrip(s):
    return s.split(':')[1].strip()

with open('input.dat', 'r') as f:
    a.z0 = [float(x) for x in splitstrip(f.readline()).split(' ')]
    a.k, a.g, a.D, a.T = tuple([float(splitstrip(x)) for x in f.read().rstrip().split('\n')])
;)
