Splitting a file into a dictionary in Python

I'm only a beginner Python user, so my apologies if this is a rather simple question. I have a file containing two columns separated by a tab. I would like to store this in a dictionary, so each entry is associated with the corresponding entry after the tab, such that:
cat hat
mouse bowl
rat nose
monkey uniform
dog whiskers
elephant dance
would be divided into
{'cat': 'hat', 'mouse': 'bowl'}, etc.
It's a very long list.
This is what I tried:
enhancerTAD = open('TAD_to_enhancer.map', 'r')
list = enhancerTAD.split()
for entry in list:
    key, val = entry.split('\t')
    ET[key] = val
print ET
Here's my most recent attempt, and the error message that I get below:
enhancerTAD = open('TAD_to_enhancer.map', 'r').read()
ET = {}
lst = enhancerTAD.split("\n")
for entry in lst:
    key, val = entry.strip().split(' ', 1)
    ET[key] = val

enhancergene = open('enhancer_to_gene_map.txt', 'r').read()
GE = {}
lst1 = enhancergene.split("\n")
for entry in lst1:
    key, val = entry.strip().split(' ', 1)
    GE[key] = val

geneTAD = open('TAD_to_gene_map.txt', 'r').read()
GT = {}
lst2 = geneTAD.split("\n")
for entry in lst2:
    key, val = entry.strip().split(' ', 1)
    GT[key] = val
File "enhancertadmaybe.py", line 13, in
key, val = entry.strip().split(' ',1)
ValueError: need more than 1 value to unpack

You can try:
with open('foo.txt', 'r') as f:
    print dict(line.strip().split('\t', 1) for line in f)
Result:
{'monkey': 'uniform', 'dog': 'whiskers', 'cat': 'hat', 'rat': 'nose', 'elephant': 'dance', 'mouse': 'bowl'}

Modification to your original method:

enhancerTAD = open('TAD_to_enhancer.map', 'r').read()
ET = {}
lst = enhancerTAD.split("\n")
for entry in lst:
    key, val = entry.strip().split('\t', 1)
    ET[key] = val
print ET
Points:
1. Your original method failed because you were trying to split a file object, not the file content, i.e.:

a = open("amazon1.txt", "r")
c = a.split()

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'file' object has no attribute 'split'

2. You have to read the content of the file before you can split it, i.e.:

enhancerTAD = open("amazon1.txt", "r").read()

3. Since you have a key/value pair on each line, you have to split on the newline character first.
4. Then you can iterate over the resulting list, split each entry at '\t', and build the dictionary (a minimal sketch combining these steps is shown below).
Juniorcomposer's method does all of this in two lines of code and is more Pythonic.
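Putting those points together, here is a minimal sketch for the original file (assuming it is tab-separated as shown). Note also that splitting the file content on "\n" leaves a trailing empty string when the file ends with a newline; an empty entry, or a line without the expected separator, is what produces the "need more than 1 value to unpack" error, so blank lines are skipped here:

ET = {}
with open('TAD_to_enhancer.map', 'r') as f:
    for line in f:
        line = line.strip()
        if not line:  # skip blank lines, including a trailing newline at the end of the file
            continue
        key, val = line.split('\t', 1)
        ET[key] = val
print(ET)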

Related

Read dictionary from file into original lists

I have one nested list, and one list for "numbers"
test_keys = [["tobbe", "kalle"], ["karl", "Clara"], ["tobbe"], ["tank"]]
test_values = ['123', '234', '345', '456']
res = {}
for key in test_keys:
    for value in test_values:
        res[value] = key
        test_values.remove(value)
        break

with open("myfile.txt", 'w') as f:
    for key, value in res.items():
        f.write('%s;%s;\n' % (key, value))
This provides the file
123;['tobbe', 'kalle'];
234;['karl', 'Clara'];
345;['finis'];
456;['tank'];
Now I want to load the data back into a dictionary without the ";", and later on back into the corresponding lists.
Try this:
res = {}
with open("myfile.txt") as file:
    for line in file:
        chunks = line.split(';')
        names = chunks[1][1:-1].split(', ')
        res[chunks[0]] = [name[1:-1] for name in names]
print(res)

test_keys = []
test_values = []
for key in res:
    test_keys.append(key)
    test_values.append(res[key])
print(test_keys)
print(test_values)
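A slightly more robust alternative for the list part is ast.literal_eval, which parses the Python list literal directly instead of trimming quotes by hand (a sketch, assuming the same myfile.txt layout as above):

import ast

res = {}
with open("myfile.txt") as file:
    for line in file:
        key, list_part, _ = line.split(';', 2)
        res[key] = ast.literal_eval(list_part)  # "['tobbe', 'kalle']" -> ['tobbe', 'kalle']
print(res)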

Creating dictionary with duplicate keys from file, having trouble storing list to the key (python 3)

Here is my txt file containing all of the lines. What I want to do is create a dictionary where I can access a key and get back a list of values:
Finance:JPMC
Software:Microsoft
Conglomerate:L&T
Conglomerate:Amazon
Software:Palantir
Defense:BAE
Defense:Lockheed
Software:TCS
Retail:TjMax
Retail:Target
Oil:Exxon
Oil:Chevron
Oil:BP
Oil:Gulf
Finance:Square
FMCG:PnG
FMCG:JohnsonNJohnson
FMCG:Nestle
Retail:Sears
Retail:FiveBelow
Defense:Boeing
Finance:Citadel
Finance:BridgeWater
Conglomerate:GE
Conglomerate:HoneyWell
Oil:ONGC
FMCG:Unilever
Semiconductor:Intel
Semiconductor:Nvidia
Semiconductor:Qualcomm
Semiconductor:Microchip
Conglomerate:Samsung
Conglomerate:LG
Finance:BoA
Finance:Discover
Software:TCS
Defense:Raytheon
Semiconductor:Microsemi
Defense:BAE
Software:Meta
Oil:SinoPec
Defense:Saab
Defense:Dassault
Defense:Airbus
Software:Adobe
Semiconductor:TSMC
FMCG:CocoCola
FMCG:Pesico
Retail:Kohls
Here is my attempted code:

f = open("companyList.txt", "r")
sector, company = [], []
for line in f:
    first, second = line.split(":")
    sector.append(first)
    company.append(second)

dictionary = {}
for key in sector:
    for element in company:
        dictionary[sector].append(element)
print(dictionary)
Since there are multiple duplicate keys, I wanted to append the values to a list under that particular key, as Python doesn't allow duplicate keys.
If I understand your question right, you can do this:

from collections import defaultdict

dictionary = defaultdict(list)
for line in f:
    first, second = line.split(":")
    dictionary[first].append(second)
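A self-contained variant of that idea (a sketch, assuming the companyList.txt file from the question; strip() removes the trailing newline so the company names come out clean):

from collections import defaultdict

dictionary = defaultdict(list)
with open("companyList.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        first, second = line.split(":", 1)
        dictionary[first].append(second)

print(dictionary["Oil"])  # every company listed under Oil in the file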
I think this is what you want:

pairs = {}
with open("tst.txt", "r") as f:
    while True:
        line = f.readline().strip()
        if not line:
            break
        sector, value = line.split(":", 1)
        if sector not in pairs:
            pairs[sector] = []
        pairs[sector].append(value)
print(pairs)
you should do:

f = open("companyList.txt", "r")
sector, company = [], []
for line in f:
    first, second = line.strip().split(":")
    sector.append(first)
    company.append(second)

dictionary = {}
for sectory, companyy in zip(sector, company):
    # setdefault creates an empty list the first time a sector appears,
    # so duplicate sectors accumulate their companies instead of overwriting
    dictionary.setdefault(sectory, []).append(companyy)
print(dictionary)

Add keys and values from a text file to a dictionary

I have a text file test.txt with two lines on it:
1 "test one"
2 "test two"
I just want to add those 2 lines to test_dict:
test_dict = {}
with open("test.txt", "r") as f:
    for line in f:
        (key, val) = line.split()
        test_dict[int(key)] = val
print(test_dict)
Error:
ValueError: too many values to unpack (expected 2)
Expected result:
test_dict = {1: "test one", 2: "test two"}
After you edited your question, this should work:

test_dict = {}
with open("testtt.txt", "r") as f:
    for line in f:
        (key, val) = line.split(maxsplit=1)
        test_dict[int(key)] = val
print(test_dict)
Because we need the first item as the key and the rest of the line as the value, I pass maxsplit=1 as an argument to split(). This splits the line only once, from the left.
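For example, on one of the lines from the question:

>>> '1 "test one"'.split(maxsplit=1)
['1', '"test one"']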

Unhashable type: list

I am working on a program that parses through log files and returns the top hits for IP addresses and a couple of other things. Currently I am having trouble, and I cannot relate any of the existing answers to this problem to what I have going on right now. This is all of my code:
import gzip
from collections import Counter

logFileName = open('C:\\Users\\Pawlaczykm\\Desktop\\fileNames.txt', 'r')
ipAdd = []
landingPages = []
ALL_ipAdd = []
ALL_landingPages = []

# everything after this line gets done to all files
for line in logFileName.readlines():
    # rstrip removes a blank line from output
    # print 'Summary of: ' + line.rstrip()
    # use gzip to decompress the file
    with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
        # we extract the ip addresses in lines 15-18
        for eachLine in f:
            parts = eachLine.split('\t')
            if len(parts) > 1:
                ipAdd.append(parts[2])
        ALL_ipAdd.append(ipAdd)

    # use gzip to decompress the file
    with gzip.open('C:\\Users\\Pawlaczykm\\Desktop\\logFiles\\' + line.rstrip() + '.gz', 'rb') as f:
        # we extract the landing pages
        for eachLine in f:
            parts = eachLine.split('\t')
            if len(parts) > 1:
                variable = parts[8].split('?')[0]
                landingPages.append(variable)
        ALL_landingPages.append(landingPages)

ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
sortedALL_ipAdd = sorted(ALL_ipAddDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top IPs of all files'
print(sortedALL_ipAdd)

ALL_LandingPageDict = dict(Counter(ALL_landingPages).most_common())
sortedALL_LandingPage = sorted(ALL_LandingPageDict.iteritems(), key=lambda (k, v): (-v, k))[:10]
print 'Top landing pages of all files'
print(sortedALL_LandingPage)
Now where I am having trouble is in the following line:
ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
The output when I run the whole program is this:
Traceback (most recent call last):
  File "C:/Users/Pawlaczykm/PycharmProjects/LogParse/parseText.py", line 35, in <module>
    ALL_ipAddDict = dict(Counter(ALL_ipAdd).most_common())
  File "C:\Python27\lib\collections.py", line 477, in __init__
    self.update(*args, **kwds)
  File "C:\Python27\lib\collections.py", line 567, in update
    self[elem] = self_get(elem, 0) + 1
TypeError: unhashable type: 'list'
Can somebody help me? This is frustrating.
From your code ALL_ipAdd = [], ipAdd = [] and ALL_ipAdd.append(ipAdd) we can conclude that ALL_ipAdd is a list of lists. Counter is a subtype of dict, which hashes its items before it counts them. Lists cannot be hashed because they are mutable (if the list changed, the hash would change), and thus lists can't be counted by Counter objects.
To solve this you can convert the inner lists to tuples before counting them:
ALL_ipAddDict = dict(Counter(map(tuple, ALL_ipAdd)).most_common())
That's normal. ALL_ipAdd is a list of lists, and Counter needs its elements to be hashable (strings, tuples, or any other hashable type) :)
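A quick illustration of the difference between counting lists and counting tuples (a standalone sketch, not tied to the log data):

from collections import Counter

try:
    Counter([['1.2.3.4'], ['1.2.3.4']])       # lists are unhashable
except TypeError as e:
    print(e)                                  # unhashable type: 'list'

print(Counter([('1.2.3.4',), ('1.2.3.4',)]))  # tuples work: Counter({('1.2.3.4',): 2})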

Finding a line in a file then reading next few lines in Python

I have a plain text file with the following data:
id=1
name=Scott
occupation=Truck driver
age=23
id=2
name=Dave
occupation=Waiter
age=16
id=3
name=Susan
occupation=Computer programmer
age=29
I'm trying to work out the best way to get to any point in the file given an id string, then grab the rows underneath to extract the data for use in my program. I can do something like:
def get_person_by_id(id):
    file = open('rooms', 'r')
    for line in file:
        if ("id=" + id) in line:
            print(id + " found")
But I'm not sure how I can now go through the next bunch of lines and do line.split("=") or similar to extract the info (put into a list or dict or whatever) that I can use in my program. Any pointers?
One option would be to load the entire thing into memory, which would save you from reading the file every time:
with open('rooms') as f:
    chunks = f.read().split('\n\n')

people_by_id = {}
for chunk in chunks:
    data = dict(row.split('=', 1) for row in chunk.split('\n'))
    people_by_id[data['id']] = data
    del data['id']

def get_person_by_id(id):
    return people_by_id.get(id)
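Hypothetical usage, assuming the 'rooms' file contains the three sample records from the question:

print(get_person_by_id('2'))
# {'name': 'Dave', 'occupation': 'Waiter', 'age': '16'}  (key order may vary)
print(get_person_by_id('99'))
# None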
How about exiting from the for loop after finding the correct line:

def get_person_by_id(id):
    file = open('rooms', 'r')
    for line in file:
        if ("id=" + id) in line:
            print(id + " found")
            break
    # now you can continue processing your file:
    next_line = file.readline()
Maybe:

d = dict()
with open(filename) as f:
    current = None
    for line in f:
        line = line.strip()
        if not line:             # skip the blank line between records
            continue
        k, v = line.split('=', 1)
        if k == 'id':
            current = v          # a new record starts; key the outer dict by its id
            d[current] = {}
        d[current][k] = v
And here is an iterative solution.

objects = []
current_object = None
with open("info.txt", "rb") as f:
    for line in f:
        line = line.strip("\r\n")
        if not line:
            current_object = None
            continue
        if current_object is None:
            current_object = {}
            objects.append(current_object)
        key, _, value = line.partition('=')
        current_object[key] = value
print objects
Another example of an iterative parser:

from itertools import takewhile

def entries(f):
    e = {}
    def read_one():
        one = {}
        for line in takewhile(lambda x: '=' in x, f):
            key, val = line.strip().split('=')
            one[key] = val
        return one
    while True:
        one = read_one()
        if not one:
            break
        else:
            e[one.pop('id')] = one
    return e
Example:
>>> with open('data.txt') as f:
...     print entries(f)['2']
{'age': '16', 'occupation': 'Waiter', 'name': 'Dave'}
Get all of the person's attributes and values (i.e. id, name, occupation, age, etc.) until you find an empty line:

def get_person_by_id(id):
    person = {}
    found = False
    file = open('rooms', 'r')
    for line in file:
        if found:
            if line.strip():
                attr, value = line.strip().split("=")
                person[attr] = value
            else:
                return person
        elif ("id=" + id) in line:
            print(id + " found")
            found = True
            attr, value = line.strip().split("=")
            person[attr] = value
    return person
This solution is a bit more forgiving of empty lines within records.
def read_persons(it):
    person = dict()
    for l in it:
        try:
            k, v = l.strip('\n').split('=', 1)
        except ValueError:
            pass
        else:
            if k == 'id':  # New record
                if person:
                    yield person
                person = dict()
            person[k] = v
    if person:
        yield person
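Hypothetical usage, again assuming the sample records from the question are stored in 'data.txt' (unlike the takewhile version, this generator keeps the 'id' key inside each record):

with open('data.txt') as f:
    people = {p['id']: p for p in read_persons(f)}
print(people['2']['occupation'])  # Waiter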
