I am making a flashcard program in which I take a text file that contains several columns, such as english word, french equivalent, gender, type of word, etc. My idea was to create a loop that read each line of the text file, separating by tabs, and makes an instance of a user-defined Word object for each line.
In the following block code I import the text file, process it into a list, then attempt to create an instance of a previously defined object: Word. I would like the object to have the second item on the list for it's name so that it is easily searchable, but it's not letting me do this, please can somebody help me with the code:
file = (open('dictionary.txt', 'r')).readline()
import re
line_list = re.split(r'\t', file.rstrip('\n'))
line_list[1] = Word(line_list[0], line_list[1], line_list[2], line_list[3])
Create a dict of instances and use the second item of the lists as key. It's a bad idea to create dynamic variables.
import re
instance_dict = {}
with open('dictionary.txt') as f:
for line in f:
line_list = re.split(r'\t', line.rstrip('\n'))
instance_dict[line_list[1]] = Word(*line_list[:4])
Why the with statement?
It is good practice to use the with keyword when dealing with file
objects. This has the advantage that the file is properly closed after
its suite finishes, even if an exception is raised on the way.
You can also use the csv module:
import csv
instances = {}
with open('dictionary.txt', 'rb') as f:
reader = csv.reader(f, delimiter='\t')
instances = {line[1]: Word(*line) for line in reader}
Here's a cleaner solution using a namedtuple. You'll end up with a dict called "words" which you use to lookup each by name.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pprint
from collections import namedtuple
Word = namedtuple('Word', ['name', 'french', 'gender', 'type_'])
words = {}
with open('dictionary.txt', 'rU') as fin:
for word in (Word(*r.rstrip('\n').split('\t')) for r in fin):
words[word.name] = word
pprint.pprint(words)
Firstly, it's better to use with, as statements to get input from files, as the closing procedures are automatically taken care of. Secondly, to read ALL of the lines from a file, you must use readlines() rather than readline(). Try something like this :
with open('dictionary.txt','r') as file :
line_list = file.readlines()
splitLineList = []
for lines in line_list :
splitLineList.append(re.split(r'\t',lines.strip('\n'))
You may have an appropriate solution depending on few clarification on your requirements
"My idea was to create a loop that read each line of the text file,
separating by tabs, and"
If the text file is already pre-validated or reliable to ignore error-handling (e.g. not evenly separated by single tabs).
with open('dictionary.txt', 'r') as f:
[line.strip().split("\t")
for line in f.read().split("\n")
if line.strip()]
will get you the (comprehensive) list required to create Word object instances, without using re
"then attempt to create an instance of a previously defined object:
Word."
with open('dictionary.txt', 'r') as f:
[Word(line.strip().split("\t"))
for line in f.read().split("\n")
if line.strip()]
"I would like the object to have the second item on the list for it's
name so that it is easily searchable,"
Can you rewrite this with an example?
but it's not letting me do this,
line_list[1] = Word(line_list[0], line_list[1], line_list[2], line_list[3])
Sorry I am loosing you here, why are using line_list[1] to refer newly created Word instances where line_list[1] itself is an argument ?
With your clarification, I would have something like this
Reworked Code:
from pprint import pprint
My assumption on your Class definition:
class Word():
def __init__(self, **kwargs):
self.set_attrs(**kwargs)
def __call__(self):
return self.get_attr("swedish_word")
def set_attrs(self, **kwargs):
for k, v in kwargs.iteritems():
setattr(self, k, v)
def get_attr(self, attr):
return getattr(self, attr)
def get_attrs(self):
return ({attr.upper():getattr(self, attr) for attr in self.__dict__.keys()})
def print_attrs(self):
pprint(self.get_attrs())
if __name__ == '__main__':
# sample entries in dictionary.txt
# swedish_word english_word article word_type
# hund dog ett noun
# katt cat ett noun
# sova sleep ett verb
with open('dictionary.txt', 'r') as f:
header = f.readline().strip().split("\t")
instances = [Word(**dict(zip(header, line.strip().split("\t"))))
for line in f.read().split("\n")
if line.strip()]
# for line in f.read().split("\n"):
# data = dict(zip(header, line.strip().split("\t")))
# w = Word(**data)
You can get instance properties for a given swedish_word like this
def print_swedish_word_properties(swedish_word):
for instance in instances:
if instance() == swedish_word:
print "Properties for Swedish Word:", swedish_word
instance.print_attrs()
print_swedish_word_properties("hund")
to have output like this
Properties for Swedish Word: hund
{'ARTICLE': 'ett',
'ENGLISH_WORD': 'dog',
'SWEDISH_WORD': 'hund',
'WORD_TYPE': 'noun'}
or you can use any other class methods to search instances on various attributes
Related
I'm very new to OOP and I have been trying to write a class I can import which will help me with parsing files. I realize I do not need to make a class to do this, but thought I'd try to so I can start getting familiar with OOP.
This code works
import re
import os
destdir = r"FilePathToDirectory"
class Parsey():
def GetNums(self,source, destination, trim = True):
with open (os.path.join(destdir,source), 'r') as infile:
with open (os.path.join(destdir,destination), 'w') as outfile:
for line in infile:
#Look for number patern match
if re.match(r'(.*)\d\d-\d\d-\d\d\d\d(.*)', line):
#If trim is True clean up the line
if trim == True:
#Find the first numeric character
firstdig = re.search("\d",line)
#Set the firstdig variable to the integer of that index
firstdig = firstdig.start()
#Set the line equal to only the begining and ending indexes of the number
line=line[firstdig:firstdig+10]
#Remove the dashes from the string
line = line.replace('-','')
outfile.writelines(line+'\n')
else:
outfile.writelines(line)
This code does not and I'm not sure why it doesn't.
import re
import os
class Parsey():
def __init__(self, destdir=''):
self.destdir = r"FilePathToDirectory"
def GetNums(self,source, destination, trim = True):
with open (os.path.join(destdir,source), 'r') as infile:
with open (os.path.join(destdir,destination), 'w') as outfile:
for line in infile:
#Look for number patern match
if re.match(r'(.*)\d\d-\d\d-\d\d\d\d(.*)', line):
#If trim is True clean up the line
if trim == True:
#Find the first numeric character
firstdig = re.search("\d",line)
#Set the firstdig variable to the integer of that index
firstdig = firstdig.start()
#Set the line equal to only the begining and ending indexes of the number
line=line[firstdig:firstdig+11]
#Remove the dashes from the string
line = line.replace('-','')
outfile.writelines(line+'\n')
else:
outfile.writelines(line)
I receive the error:
line 10, in GetNums
with open (os.path.join(destdir,source), 'r') as infile:
NameError: name 'destdir' is not defined
It was my understanding that the namespace of the class object would allow the functions within the class to see all variables declared in that class.
You need to change line 10 to:
with open (os.path.join(self.destdir, destination), 'w') as outfile:
In your case Python looks for testdir inside GetNums first and, if it cannot find it there, it will look for this name in the module. It does not magically use tesdir from __init__. The name self stands for the instance you will create later. So in __init__ you essentially set mysinstance.testdir and later in GetNums you can access with mysinstance.testdir. self is just the placeholder for mysinstance, i.e the instance you create later.
You can read the detail in the documentation.
#Mike Müller nailed it, but here is the corrected code in its entirety.
import re
import os
class Parsey():
def __init__(self, destdir=''):
self.destdir = r"FilePathToDirectory"
def GetNums(self,source, destination, trim = True):
with open (os.path.join(self.destdir,source), 'r') as infile:
with open (os.path.join(self.destdir,destination), 'w') as outfile:
for line in infile:
#Look for number patern match
if re.match(r'(.*)\d\d-\d\d-\d\d\d\d(.*)', line):
#If trim is True clean up the line
if trim == True:
#Find the first numeric character
firstdig = re.search("\d",line)
#Set the firstdig variable to the integer of that index
firstdig = firstdig.start()
#Set the line equal to only the begining and ending indexes of the number
line=line[firstdig:firstdig+10]
#Remove the dashes from the string
line = line.replace('-','')
outfile.writelines(line+'\n')
else:
outfile.writelines(line)
First I'd like to mention that I am completely new to Python and I've found it a bit difficult to transition from C++. I apologize if my question comes off as elementary.
I have a class for 'songs' which I have initialized as following. It takes in data from a file that contains a song's ID, name, genre etc. all separated by ::.
def __init__(self):
self.song_names = dict()
self.song_genres = dict()
def load_songs(self, song_id):
f = open(song_id)
for line in f:
line = line.rstrip()
component = line.split("::")
sid = components[0]
same= components[1]
sgenre=components[2]
self.song_names[mid] = sname
self.song_genres[mid] = sgenre
f.close()
The program also takes in data from a file with 'users' information, separated as
UserID::Gender::Age::Occupation::Zip etc. and a file with 'ratings'.
I would I implement a function like def set_song(sid, list((title,genres)))
and something like delete_song(sid) ?
I'm going to have to wind up doing a ton more other functions, but if someone could help me with those two - at least to have a better idea of structure and syntax - handling the others should be easier.
Why not just inherit from dict and use its interface? That way you can use Python's standard mapping operations instead of rolling your own:
class Songs(dict):
def load(self, song_id):
with open(song_id, 'r') as f:
for line in f:
sid, name, genre = line.rstrip().split('::')[:3]
self[sid] = [name, genre]
mysongs = Songs()
mysongs.load('barnes_and_barnes__fish_heads')
mysongs['barnes_and_barnes__fish_heads'] = ['roly poly', 'strange'] # set
del mysongs['barnes_and_barnes__fish_heads'] # delete
So, this is my code.
def classMaker(csv):
csv = csv.split("/n")
firstLine = csv[0]
csv = csv[1:]
class newClass():
def __init__(self, line):
self.vars = firstLine
for i in range(len(line)):
self[firstLine[i]] = line[i]
return [newClass(line) for line in csv]
The problem is an AttributeError in self[firstLine[i]] = line[i]. It says
AttributeError: newClass instance has no attribute '__setitem__'
I don't know why it is causing this error. My goal is to take in a csv file exported from Excel and auto-generate object names from field names.
Thank you in advance.
You can avoid the newClass all together if you use collections.namedtuple:
CSVRow = namedtuple("CSVRow", firstLine)
return [CSVRow(*line) for line in csv]
This assumes that the CSV headers will be valid Python identifiers (that is, if you have entires like "Some Value" this won't work if you don't process firstLine.
This will let you do things like this:
# Let's assume your CSV has a Name field
# and that it is the first column
csv_data[3].Name == csv_data[3][0]
# True
Also, you should look into the csv module to simplify CSV processing.
If I can infer your intent correctly, you want to replace this line:
self[firstLine[i]] = line[i]
with this:
setattr(self, firstline[i], line[i])
This will create an attribute of your newClass object named after the column in your data.
E.g.:
Name, Date, Weight
Joe, 23-Sep, 99
...
and
data = classMaker('file.csv')
will produce :
data[0].Name == 'Joe'
P.s. I assume that you will add file I/O, parsing the CSV file, and other missing elements.
P.p.s: You can avoid the loop counter i altogether:
for attr, val in zip(firstLine, line):
setattr(self, attr, val)
P.p.s: Here is a complete working sample:
import csv
def classMaker(filename):
class newClass(object):
def __init__(self, line):
for attr, val in zip(firstLine, line):
setattr(self, attr, val)
with open(filename, 'rb') as csvfile:
spamreader = csv.reader(csvfile)
firstLine = spamreader.next()
return [newClass(line) for line in spamreader]
x = classMaker("/tmp/x.csv")
print x[0].Name
Can someone help me figure out what I'm doing wrong?
I'm writing a python shell script that takes an ldif file and a csv file and then appends the contents in the csv file to the end of each record in the ldif. Something like:
Sample CSV:
"KEY","VALUE"
"abc","def"
"foo","bar"
"qwop","flop"
Sample .ldif:
dn: Aziz
cn: Aziz_09
dn: Carl
cn: Carl_04
After python myscript.py "sample.ldif" "sample.csv"
dn: Aziz
cn: Aziz_09
KEY: VALUE
abc: def
foo: bar
qwop: flop
dn: Carl
cn: Carl_04
KEY: VALUE
abc: def
foo: bar
qwop: flop
So far my code compiles however it doesn't modify the file correctly. I'm creating an object that takes a csv file path name string on creation and then stores the keys into a list field and stores the values into a list field. I then open the ldif file, parse for the escape characters between records and insert the list fields (KEY and VALUE) at the end of each record:
import sys, csv
# Make new object that can open a csv and set csv data in its arrays
class Container(object):
def __init__(self, filename=None, keys=None, values=None):
self.filename = filename
self.keys = []
self.values = []
# Opens self.filename and puts 0th and 1st rows into keys and values respectively
def csv_to_list():
with open(self.filename, 'rb') as f:
reader = csv.reader(f)
for row in reader:
self.keys = row[0]
self.values = row[1]
haruhi = Container("./content/test_pairs.txt")
haruhi.csv_to_list
# open first argument of the command line call to ldif_record_a.py for read/writing
with open(sys.argv[1],'r+') as f1:
lines=[x.strip() for x in f1] # Create list with each line as an element
f1.truncate(0)
f1.seek(0)
count = 0
for x in lines:
if x:
f1.write(x+'\n')
else:
f1.write("{0}: {1}\n\n".format(haruhi.keys[count] , haruhi.values[count]))
count = count + 1
f1.write("{0}: {1}\n\n".format(haruhi.keys[count] , haruhi.values[count]))
I am new to Python! Any help, advice and/or resource direction would be greatly appreciated! Thank you SO
Okay, I adhoc'd this, so it needs work, but here goes:
import csv
import re
csv_data = list(csv.reader(open('/home/jon/tmp/data.csv'))) # (1)
csv_text = '\n' + '\n'.join('{0} : {1}'.format(*row) for row in csv_data) # (2)
with open('/home/jon/tmp/other.ldif') as f:
contents = f.read() # (3)
print re.sub(r'(\n\n)|(\n$)', csv_text + '\n\n', contents) # (4)
(1) Read the CSV file into a list of lists
csv_data == [['KEY', 'VALUE'], ['abc', 'def'], ['foo', 'bar'], ['qwop', 'flop']]
(2) Create a text representation to be append to each ldif
KEY : VALUE
abc : def
foo : bar
qwop : flop
(3) Open and read the entire contents into memory (not very efficient mind you)
(4) Use a regular expression to find the "next bit" after the ldif and put in text
Prints:
dn: Aziz
cn: Aziz_09
KEY : VALUE
abc : def
foo : bar
qwop : flop
dn: Carl
cn: Carl_04
KEY : VALUE
abc : def
foo : bar
qwop : flop
You'll need to adjust it to write data back out or whatever you want..., but is a possible starting point - but strongly recommend you use it a base to work through accompanied by the Python manual. Feel free to ask for any clarification.
I know how to do it for a TXT file, but now I am having some trouble doing it for a CSV file.
How can I read a CSV file from the bottom in Python?
Pretty much the same way as for a text file: read the whole thing into a list and then go backwards:
import csv
with open('test.csv', 'r') as textfile:
for row in reversed(list(csv.reader(textfile))):
print ', '.join(row)
If you want to get fancy, you could write a lot of code that reads blocks starting at the end of the file and working backwards, emitting a line at a time, and then feed that to csv.reader, but that will only work with a file that can be seeked, i.e. disk files but not standard input.
Some of us have files that do not fit into memory, could anyone come with a solution that does not require storing the entire file in memory?
That's a bit trickier. Luckily, all csv.reader expects is an iterator-like object that returns a string (line) per call to next(). So we grab the technique Darius Bacon presented in "Most efficient way to search the last x lines of a file in python" to read the lines of a file backwards, without having to pull in the whole file:
import os
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
for block in reversed_blocks(file):
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def reversed_blocks(file, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
and feed reversed_lines into the code to reverse the lines before they get to csv.reader, removing the need for reversed and list:
import csv
with open('test.csv', 'r') as textfile:
for row in csv.reader(reversed_lines(textfile)):
print ', '.join(row)
There is a more Pythonic solution possible, which doesn't require a character-by-character reversal of the block in memory (hint: just get a list of indices where there are line ends in the block, reverse it, and use it to slice the block), and uses chain out of itertools to glue the line clusters from successive blocks together, but that's left as an exercise for the reader.
It's worth noting that the reversed_lines() idiom above only works if the columns in the CSV file don't contain newlines.
Aargh! There's always something. Luckily, it's not too bad to fix this:
def reversed_lines(file):
"Generate the lines of file in reverse order."
part = ''
quoting = False
for block in reversed_blocks(file):
for c in reversed(block):
if c == '"':
quoting = not quoting
elif c == '\n' and part and not quoting:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
Of course, you'll need to change the quote character if your CSV dialect doesn't use ".
Building on #mike-desimone 's answer. Here's a solution that provides the same structure as a python file object but is read in reverse, line by line:
import os
class ReversedFile(object):
def __init__(self, f, mode='r'):
"""
Wraps a file object with methods that make it be read in reverse line-by-line
if ``f`` is a filename opens a new file object
"""
if mode != 'r':
raise ValueError("ReversedFile only supports read mode (mode='r')")
if not type(f) == file:
# likely a filename
f = open(f)
self.file = f
self.lines = self._reversed_lines()
def _reversed_lines(self):
"Generate the lines of file in reverse order."
part = ''
for block in self._reversed_blocks():
for c in reversed(block):
if c == '\n' and part:
yield part[::-1]
part = ''
part += c
if part: yield part[::-1]
def _reversed_blocks(self, blocksize=4096):
"Generate blocks of file's contents in reverse order."
file = self.file
file.seek(0, os.SEEK_END)
here = file.tell()
while 0 < here:
delta = min(blocksize, here)
here -= delta
file.seek(here, os.SEEK_SET)
yield file.read(delta)
def __getattribute__(self, name):
"""
Allows for the underlying file attributes to come through
"""
try:
# ReversedFile attribute
return super(ReversedFile, self).__getattribute__(name)
except AttributeError:
# self.file attribute
return getattr(self.file, name)
def __iter__(self):
"""
Creates iterator
"""
return self
def seek(self):
raise NotImplementedError('ReversedFile does not support seek')
def next(self):
"""
Next item in the sequence
"""
return self.lines.next()
def read(self):
"""
Returns the entire contents of the file reversed line by line
"""
contents = ''
for line in self:
contents += line
return contents
def readline(self):
"""
Returns the next line from the bottom
"""
return self.next()
def readlines(self):
"""
Returns all remaining lines from the bottom of the file in reverse
"""
return [x for x in self]
Go for it. This is simple program to reverse the rows from a CSV file.
import csv
BC_file = open('Master.csv', 'rb')
BC_reader = csv.reader(BC_file)
next(BC_reader)
for row in reversed(list(BC_reader)):
print row[0]