Use Python to extract lines that contain different keywords into dictionaries

So I have an input file to the script like the following:
20248109|Generic|1|xxx|2|yyy|LINEA|68.66|68.67|True|2920958141272
.
.
.
21248109|Generic|3|xxx|4|www|LINEB|7618|7622|True|2920958281071.97
I want the Python script to iterate through the file and put the matching rows into dictionaries like this: {1: [68.66, 68.67]}, {3: [7618, 7622]}
Here's as far as I've gotten:
Key = ["LINEA", "LINEB"]
fin = open(path)
test = []
for line in fin.readlines():
    if True in [item in line for item in Key]:
        test.append(line)
Any help at all would be fantastic.

First, you should use the csv module:
import csv

with open(path, "rb") as infile:
    reader = csv.reader(infile, delimiter="|")
Then, still inside the with block (the file is closed once it exits), you can iterate over the rows:
    test = []
    for row in reader:
        if row[6] in Key:
            test.append({int(row[2]): row[7:9]})
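Putting the two pieces together, here is a minimal end-to-end sketch (Python 3, hence text mode with newline=""; path is the asker's variable). It builds one combined dictionary rather than a list of one-entry dictionaries, which is an assumption about what the asker wants, and converts the values to float:

import csv

Key = {"LINEA", "LINEB"}
result = {}
with open(path, newline="") as infile:
    for row in csv.reader(infile, delimiter="|"):
        if row[6] in Key:  # column 7 carries the LINEA/LINEB tag
            result[int(row[2])] = [float(row[7]), float(row[8])]
print(result)  # e.g. {1: [68.66, 68.67], 3: [7618.0, 7622.0]}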

I would do this:
keys = ["LINEA", "LINEB"]
with open(path) as fin:
    answer = {line.partition("Generic|")[-1]: line for line in fin if any(key in line for key in keys)}
To edit your answer directly, you're actually quite close:
Key = ["LINEA", "LINEB"]
fin = open(path)
test = {}  # dictionary
for line in fin.readlines():
    if True in [item in line for item in Key]:
        dict_key = line.partition("Generic|")[-1]
        test[dict_key] = line

Related

Incorrectly reading lines of a text file in Python

So basically I want to iterate over the lines of a text file that has this format:
-----------------------------------------
Code: 0123456789
EGGS: 3 7.00 21.00
BACON: 1 3.50 3.50
COFFEE: 2 14.20 28.40
TOTAL: 52.90
-----------------------------------------
and I have the following code to read the lines one by one:
with open(filename, "rt", encoding="utf-8") as f:
    for line in f:
        prevline = line
        line.split()
        if '-' in line:
            temp = f.readline().split(':')  # Get Code
            print(temp)
            AFM = temp[1]
            print(AFM)
        else:
            tempProducts = line.split(':')  # Get Product in a list
            productName = tempProducts[0]  # Store Product Name in a variable
            productStats = tempProducts[1]  # Store Product Stats in a list
            productStats = productStats.split(" ")
            for value in productStats:
                valueArray.append(float(value))
            products.update({productName: valueArray})
            if '-' in f.readline():
                rec = Receipt(AFM, products)
                products = {}
                valueArray = []
                receipts.append(rec)
            else:
                line = prevline
Mind that I want to skip the lines with the '------------' characters. The code works, but it keeps reading the second line, then the fourth, then the sixth (code, bacon, total). The question is: how can I fix this? Edit: there are multiple receipts in the file, so each time I need to skip the line with the '----------'.
with open(filename, "rt", encoding="utf-8") as f:
    old_list = []  # saving all the lines, including the '----' ones
    for line in f:
        old_list.append(line)
new_list = old_list[1:-1]  # new list with the '----' lines removed
You can iterate just through new_list with your .split logic.
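For instance, a sketch of that split logic over new_list for a single receipt (Receipt and receipts are taken from the question's code; one receipt per file is assumed here):

products = {}
for line in new_list:
    name, _, stats = line.partition(':')
    if name == 'Code':
        AFM = stats.strip()
    else:
        # e.g. 'EGGS: 3 7.00 21.00' -> {'EGGS': [3.0, 7.0, 21.0]}
        products[name] = [float(v) for v in stats.split()]
receipts.append(Receipt(AFM, products))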
See if this does the job:
with open(filename, "rt", encoding="utf-8") as f:
    for line in f:
        if '-' not in line:
            if 'Code' in line:
                AFM = line.split(':')[1].strip()
                print(AFM)
                products = {}
            else:
                tempProducts = line.split(':')  # get product name and stats
                productName = tempProducts[0]  # store product name in a variable
                productStats = tempProducts[1]  # store the stats as a string
                # split() without arguments skips the extra spaces
                valueArray = [float(value) for value in productStats.split()]
                products[productName] = valueArray
                if 'TOTAL' in line:
                    rec = Receipt(AFM, products)
                    receipts.append(rec)
To anyone seeing this post now: consider it closed. I did not provide enough information and the code was messed up. Sorry for wasting your time.

Why am I getting "access denied" when I try to replace files?

I am trying to replace a file by basically copying a file named "test.csv" to "new.csv". But it cannot find test.csv, even though it's in the same working directory.
def cop(self):
    with open('D:\\johnp\\kivy_venv\\betaapp2\\test.csv') as infile:
        with open('D:\\johnp\\kivy_venv\\betaapp2\\new.csv', 'w') as outfile:
            for line in infile:
                # do things
                outfile.write(line)
    os.replace('D:\\johnp\\kivy_venv\\betaapp2\\new.csv', 'D:\\johnp\\kivy_venv\\betaapp2\\test.csv')
def sign_in(self, text_input):
    self.text_input = text_input
    count = 0
    h = ""
    d = ""
    with open('D:\\johnp\\kivy_venv\\betaapp2\\test.csv', 'r') as fh:
        reader = csv.reader(fh)
        # get the headers out first, which is the first line
        headers = next(reader)
        for line in reader:
            # this will make a dictionary with the header values as
            # keys and the line entries as values
            entry = dict(zip(headers, (l.strip() for l in line)))
            # use key access, it makes the code a bit more readable
            if entry['Name'] == text_input.strip():
                if entry['Position'] == "Vice President":
                    line[8] = float(line[8]) + 3.5
                self.cop()
                self.signin()
            else:
                self.noUser()
test.csv is supposed to be updated by running sign_in, which copies it to new.csv and then replaces test.csv with new.csv.
They are in the same directory, but you haven't told Python to look there:
import os

base_path = "D:\\johnp\\kivy_venv\\betaapp2\\"
with open(os.path.join(base_path, "test.csv")) as infile:
    with open(os.path.join(base_path, "new.csv"), 'w') as outfile:
Without a full path, Python just looks in the current working directory.
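For completeness, a minimal sketch of the whole copy-and-swap under the same assumed base_path. Note that os.replace runs only after both with blocks have exited: Windows refuses to replace a file that is still open, which is another way to end up with an access-denied error:

import os

base_path = "D:\\johnp\\kivy_venv\\betaapp2"
src = os.path.join(base_path, "test.csv")
tmp = os.path.join(base_path, "new.csv")

with open(src) as infile, open(tmp, "w") as outfile:
    for line in infile:
        # do things
        outfile.write(line)

# Both files are closed here, so the swap is safe on Windows.
os.replace(tmp, src)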

Python: search a file for text using input from another file

I'm new to Python and programming. I need some help with a Python script. There are two files, each containing email addresses (more than 5000 lines). The input file contains email addresses that I want to search for in the data file (which also contains email addresses). Then I want to print the output to a file or display it on the console. I searched for scripts and was able to modify one, but I'm not getting the desired results. Can you please help me?
dfile1 (50K lines)
yyy#aaa.com
xxx#aaa.com
zzz#aaa.com
ifile1 (10K lines)
ccc#aaa.com
vvv#aaa.com
xxx#aaa.com
zzz#aaa.com
Output file
xxx#aaa.com
zzz#aaa.com
datafile = 'C:\\Python27\\scripts\\dfile1.txt'
inputfile = 'C:\\Python27\\scripts\\ifile1.txt'

with open(inputfile, 'r') as f:
    names = f.readlines()

outputlist = []
with open(datafile, 'r') as fd:
    for line in fd:
        name = fd.readline()
        if name[1:-1] in names:
            outputlist.append(line)
        else:
            print "Nothing found"
print outputlist
New Code
with open(inputfile, 'r') as f:
    names = f.readlines()

outputlist = []
with open(datafile, 'r') as f:
    for line in f:
        name = f.readlines()
        if name in names:
            outputlist.append(line)
        else:
            print "Nothing found"
print outputlist
Maybe I'm missing something, but why not use a pair of sets?
#!/usr/local/cpython-3.3/bin/python

data_filename = 'dfile1.txt'
input_filename = 'ifile1.txt'

with open(input_filename, 'r') as input_file:
    input_addresses = set(email_address.rstrip() for email_address in input_file.readlines())

with open(data_filename, 'r') as data_file:
    data_addresses = set(email_address.rstrip() for email_address in data_file.readlines())

print(input_addresses.intersection(data_addresses))
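If you also want the matches in a file rather than just printed, a small follow-up (the output filename here is just an example):

with open('ofile1.txt', 'w') as output_file:
    for address in sorted(input_addresses & data_addresses):
        output_file.write(address + '\n')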
mitan8 gives the problem you have, but this is what I would do instead:
with open(inputfile, "r") as f:
    names = set(i.strip() for i in f)

output = []
with open(datafile, "r") as f:
    for name in f:
        if name.strip() in names:
            print name
This avoids reading the larger datafile into memory.
If you want to write to an output file, you could do this for the second with statement:
with open(datafile, "r") as i, open(outputfile, "w") as o:
    for name in i:
        if name.strip() in names:
            o.write(name)
Here's what I would do:
names = []
with open(inputfile) as f:
    for line in f:
        names.append(line.rstrip("\n"))
myEmails = set(names)

with open(datafile) as fd, open("emails.txt", "w") as output:
    for line in fd:
        name = line.rstrip("\n")
        if name in myEmails:
            print name  # for console
            output.write(name + "\n")  # for writing to file
I think your issue stems from the following:
name = fd.readline()
if name[1:-1] in names:
name[1:-1] slices each email address so that you skip the first and last characters. While it might be good in general to skip the last character (a newline '\n'), when you load the name database from the input file
with open(inputfile, 'r') as f:
    names = f.readlines()
you are including newlines. So, don't slice the names in the "ifile" at all, i.e.
if name in names:
I think you can remove name = fd.readline(), since you've already got the line from the for loop; it reads another line on top of the for loop, which already reads one line each time. Also, I think name[1:-1] should be name, since you don't want to strip the first and last characters when searching. with automatically closes the files it opens.
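Putting those fixes together, a corrected version of the original loop might look like this (newlines stripped on both sides so the membership test is reliable):

with open(inputfile, 'r') as f:
    names = set(line.strip() for line in f)

outputlist = []
with open(datafile, 'r') as fd:
    for line in fd:
        if line.strip() in names:
            outputlist.append(line)
print outputlist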
PS: How I'd do it:
with open("dfile1") as dfile, open("ifile") as ifile:
lines = "\n".join(set(dfile.read().splitlines()) & set(ifile.read().splitlines())
print(lines)
with open("ofile", "w") as ofile:
ofile.write(lines)
In the above solution, I'm basically taking the intersection (the elements present in both sets) of the lines of the two files to find the common lines.

Python Noob issue with populating dictionary from file. Then updating dict and writing back to file

The code below is supposed to look up the first column (the key) in a file, dict_file, and replace the first column of another file, fr, with the value found for that key in dict_file, while keeping dict_file updated as a dictionary for future lookups.
Every time the code is run, it initializes a dictionary from dict_file. If it finds a new email address in the other file, it appends it to the bottom of dict_file.
It should work fine according to my understanding, because if it doesn't find a # symbol it assigns looking_for the value "Dummy#dummy.com", and Dummy#dummy.com should then be appended to the bottom of dict_file.
But for some reason, I keep getting newlines and blank lines appended, along with the other new emails, at the end of dict_file. I can't be writing blanks and newlines to the end of dict_file.
Why is this happening? What's wrong in the code below? My brain is about to explode! Any help will be greatly appreciated!
#!/usr/bin/python
import sys

d = {}
line_list = []
alist = []

f = open(sys.argv[3], 'r')  # Map file
for line in f:
    alist = line.split()
    key = alist[0]
    value = alist[1]
    d[str(key)] = str(value)
    alist = []
f.close()

fr = open(sys.argv[1], 'r')  # source file
fw = open(sys.argv[2]+"/masked_"+sys.argv[1], 'w')  # target file

for line in fr:
    columns = line.split("|")
    looking_for = columns[0]  # this is what we need to search
    if looking_for in d:
        # by default, iterating over a dictionary will return keys
        if not looking_for.find("#"):
            looking_for == "Dummy#dummy.com"
            new_line = d[looking_for]+'|'+'|'.join(columns[1:])
            line_list.append(new_line)
        else:
            new_line = d[looking_for]+'|'+'|'.join(columns[1:])
            line_list.append(new_line)
    else:
        new_idx = str(len(d)+1)
        d[looking_for] = new_idx
        kv = open(sys.argv[3], 'a')
        kv.write("\n"+looking_for+" "+new_idx)
        kv.close()
        new_line = d[looking_for]+'|'+'|'.join(columns[1:])
        line_list.append(new_line)
fw.writelines(line_list)
Here is the dict_file:
WHATEmail#SIMPLE.COM 223
SamHugan#CR.COM 224
SAMASHER#CATSTATIN.COM 225
FAKEEMAIL#SLOW.com 226
SUPERMANN#MYMY.COM 227
Here is the fr file that gets the first column turned into the id from the dict_file lookup:
WHATEmail#SIMPLE.COM|12|1|GDSP
FAKEEMAIL#SLOW.com|13|7|GDFP
MICKY#FAT.COM|12|1|GDOP
SUPERMANN#MYMY.COM|132|1|GUIP
MONITOR|132|1|GUIP
|132|1|GUIP
00 |12|34|GUILIGAN
Firstly, you need to ignore blanks in your initial dictionary read; otherwise you will get an index-out-of-range error when you run this script again. Do the same when you read via the fr object, to avoid entering nulls. Move your email check further out so it applies in both branches, and do a simple check for the "#" using the find method. And you're good to go.
Try the below. This should work:
#!/usr/bin/python
import sys

d = {}
line_list = []

f = open(sys.argv[3], 'r')  # persisted dictionary file
for line in f:
    line = line.strip()
    if line == "":
        continue
    alist = line.split()
    key = alist[0]
    value = alist[1]
    d[str(key)] = str(value)
f.close()

fr = open(sys.argv[1], 'r')  # source file
fw = open(sys.argv[2]+"/masked_"+sys.argv[1], 'w')  # target directory location

for line in fr:
    line = line.strip()
    if line == "":
        continue
    columns = line.split('|')
    if columns[0].find("#") > 1:
        looking_for = columns[0]  # this is what we need to search
    else:
        looking_for = "Dummy#dummy.com"
    if looking_for in d:
        # membership tests on a dictionary check the keys
        new_line = d[looking_for]+'|'+'|'.join(columns[1:])
        line_list.append(new_line + '\n')  # re-add the newline stripped above
    else:
        new_idx = str(len(d)+1)
        d[looking_for] = new_idx
        kv = open(sys.argv[3], 'a')
        kv.write(looking_for+" "+new_idx+'\n')
        kv.close()
        new_line = d[looking_for]+'|'+'|'.join(columns[1:])
        line_list.append(new_line + '\n')  # re-add the newline stripped above
fw.writelines(line_list)
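For reference, given the sys.argv indices above, the script would be run along the lines of python mask_script.py <source_file> <target_directory> <dict_file> (the script name here is hypothetical; the argument order follows the code).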

Return lines from a file in the order given by a list

I've tried to put together a solution from similar questions but have failed miserably. I just don't know enough about Python yet :(
I have an inputlist containing elements in a particular order, e.g. ["GRE", "KIN", "ERD", "KIN"].
I have a datafile containing the elements, plus other data, e.g.:
"ERD","Data","Data"...
"KIN","Data","Data"...
"FAC","Data","Data"...
"GRE","Data","Data"...
I need to create an outputlist that contains the lines from the datafile in the order they appear in the inputlist.
The code below returns the outputlist in the order the lines appear in the datafile, which is not the intended behavior... :-\
with open(inputfile, 'r') as f:
    names = [line.strip() for line in f]

outputlist = []
with open(datafile, 'r') as f:
    for line in f:
        name = line.split(',')[0]
        if name[1:-1] in names:
            outputlist.append(line)

output = open(outputfile, 'w')
output.writelines(outputlist)
How can I have it return the list in the proper order? Thanks in advance for your help :-)
Edit
Thanks to Oscar, this is the solution I implemented:
datafile = 'C:\\testing\\bldglist.txt'
inputfile = 'C:\\testing\\inputlist.txt'
outputfile = "C:\\testing\\output.txt"

with open(inputfile, 'r') as f:
    inputlist = [line.strip() for line in f]

def outputList(inputlist, datafile, outputfile):
    d = {}
    with open(datafile, 'r') as f:
        for line in f:
            line = line.strip()
            key = line.split(',')[0]
            d[key] = line
    with open(outputfile, 'w') as f:
        f.write('"Abbrev","Xcoord","Ycoord"\n')
        for key in inputlist:
            f.write(d[key] + '\n')

outputList(inputlist, datafile, outputfile)
This is the easy solution. It reads the entire data file into memory as a dictionary mapping the first field to its line. It's then easy to write the lines in the right order.
If the file is very large (gigabytes) or you don't have a lot of memory, there are other ways (one is sketched after the code below), but they're not nearly as nice.
I haven't tested this.
import csv

data = {}
with open(datafile) as f:
    for line in csv.reader(f):
        data[line[0]] = line

with open(outputfile, "w") as f:
    writer = csv.writer(f)
    for entry in inputlist:
        writer.writerow(data[entry])
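As for the "other ways" for very large files, here is one memory-lighter sketch for illustration: it keeps only a mapping from key to file offset and seeks back for each lookup (datafile, inputlist, and outputfile are the names from the question; the first occurrence of each key is assumed to be the wanted one):

offsets = {}
with open(datafile) as f:
    while True:
        pos = f.tell()
        line = f.readline()
        if not line:
            break
        key = line.split(',')[0]
        offsets.setdefault(key, pos)  # remember where each key's line starts

with open(datafile) as f, open(outputfile, 'w') as out:
    for entry in inputlist:
        f.seek(offsets[entry])  # jump straight to the stored line
        out.write(f.readline())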
Assuming a data file with this format:
"ERD","Data","Data"...
"KIN","Data","Data"...
"FAC","Data","Data"...
"GRE","Data","Data"...
Try this solution:
def outputList(inputlist, datafile, outputfile):
    d = {}
    with open(datafile, 'r') as f:
        for line in f:
            line = line.lstrip()
            key = line.split(',')[0]
            d[key] = line
    with open(outputfile, 'w') as f:
        for key in inputlist:
            f.write(d[key])
Use it like this:
outputList(['"GRE"', '"KIN"', '"ERD"', '"KIN"'],
           '/path/to/datafile',
           '/path/to/outputfile')
It will write the output file with the expected order.
1) Create a list with the elements you wish to map to. In this case, ["GRE", "KIN", "ERD", "FAC"]
2) Read the file and map (using a dictionary of lists) the first elements.
3) Output to a file.
import csv

out_index = ["GRE", "KIN", "ERD", "FAC"]
d = {}
with open('/Users/andrew/bin/SO/abcd.txt', 'r') as fr:
    for e in csv.reader(fr):
        if e[0] not in d:
            d[e[0]] = []
        for ea in e[1:]:
            d[e[0]].append(ea)

for i in out_index:
    print i, ":"
    for e in d[i]:
        print ' ', e
Given this example data:
"ERD","Data-a1","Data-a2"
"KIN","Data-b1","Data-b2"
"FAC","Data-c1","Data-c2"
"GRE","Data-d1","Data-d2"
"ERD","Data-a3","Data-a4"
"GRE","Data-d3","Data-d4"
Output:
GRE :
Data-d1
Data-d2
Data-d3
Data-d4
KIN :
Data-b1
Data-b2
ERD :
Data-a1
Data-a2
Data-a3
Data-a4
FAC :
Data-c1
Data-c2
Done!
