Problem with seek() in file handling - Python

I am using seek on a file. The file contains a bunch of filenames, each followed by the logs of a process run on that file; some of these logs contain errors. I am going line by line, and if I hit an error, I want to log everything between the two surrounding filenames.
When I use seek, I think that instead of moving the pointer to the line number I give it, it moves it to that character number instead. For example:
f = open("fileblah", 'r')
counter = 0
tail_position = 0
while f:
    line = f.readline()
    counter = counter + 1
    f.seek(tail_position)  # i want the next loop to start from after the error happened.
    if line.startswith("D:"):
        header_position = counter
        error_flag = 0  # unset error flag
    if line.startswith("error"):
        error_flag = 1  # set error_flag
        while not line.startswith("D:"):  # go until next file beginning
            line = f.readline()
            counter = counter + 1
        tail_position = counter  # have come to the next filename
I can see this is highly inefficient, but it doesn't work at all, because f.seek(tail_position) moves the file pointer to character number tail_position, not line number tail_position.

Use .tell() to store your start-of-line position, then you can .seek() back to it.
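A quick illustration of that pattern (a minimal sketch, assuming "fileblah" is a readable text file):
with open("fileblah") as f:
    start_of_line = f.tell()  # opaque position where the next line begins
    line = f.readline()
    f.seek(start_of_line)  # jump back to the start of that line
    assert f.readline() == line  # the same line is read again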
Edit: I think this is what you want:
def errorsInLog(fname, newfileStr='D:', iserrorStr='error'):
    with open(fname) as inf:
        prev = pos = inf.tell()
        line = inf.readline()
        error = False
        while line:
            if line.startswith(newfileStr):
                if error:
                    inf.seek(prev)
                    yield inf.read(pos - prev)
                prev = pos
                error = False
            elif line.startswith(iserrorStr):
                error = True
            pos = inf.tell()
            line = inf.readline()
        if error:
            inf.seek(prev)
            yield inf.read()

def main():
    print('\n\n'.join(errorsInLog('fileblah')))
For each filename followed by an error it returns a string encompassing the filename and all following lines, up to but not including the next filename or end-of-file.

seek() is used more often in random-access file reading. If the file being read is already text and it can be read line by line, then you need only read each line and operate on it using string operations. There is no need to move the file read position.
Your code need only look like this:
for line in f:
    do_stuff_with(line)
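Applied to the asker's log format, a hedged sketch (the "D:" and "error" prefixes come from the question; the filename is a placeholder) that collects each block between filenames and prints it only when it contained an error, with no seeking at all:
block = []
has_error = False
with open("fileblah") as f:
    for line in f:
        if line.startswith("D:"):  # a new filename starts a new block
            if has_error:
                print(''.join(block), end='')  # emit the finished block
            block = []
            has_error = False
        elif line.startswith("error"):
            has_error = True
        block.append(line)
if has_error:  # don't forget the final block
    print(''.join(block), end='')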

Like stdio's fseek(), seek(offset[, whence]) sets the offset from a reference position, and whence defaults to 0 (the start of the file). So the offsets are character (byte) positions, not line numbers. Record the byte offset you want to come back to with tell(), then hand that value to seek():
while not line.startswith("D:"):
    line = f.readline()
tail_position = f.tell()  # byte offset where the next line starts
f.seek(tail_position)  # later, jump straight back to that spot
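For reference, a small self-contained sketch of the three whence values (using an in-memory bytes stream, since text-mode files only accept offsets previously returned by tell()):
import io

f = io.BytesIO(b"0123456789")
f.seek(3)  # whence defaults to 0: 3 bytes from the start
f.seek(2, 1)  # whence 1: 2 bytes forward from the current position
f.seek(-1, 2)  # whence 2: 1 byte back from the end
print(f.read())  # b'9'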

Related

Not able to clear file Contents

I have a file which has the below data:
edit 48
set dst 192.168.4.0 255.255.255.0
set device "Tague-VPN"
set comment "Yeshtel"
edit 180
set dst 64.219.107.45 255.255.255.255
set device "Austin-Backup"
set comment "images.gsmc.org"
I want to copy the commands under each edit block only if the set device line is "Austin-Backup".
string = 'set device'
word = '"Austin-Backup"'
with open('test.txt') as oldfile, open('script.txt', 'w') as newfile:
    for line in oldfile:
        newfile.write(line)
        newfile.write('\n')
        if string not in line:
            pass
        elif string in line:
            if word not in line:
                a = open('script.txt', 'w')
                a.close()
            else:
                pass
I am trying to write the test file's contents to a new file (script.txt), and if the command set comment "Yeshtel" is found, I want to delete the contents of the new file. I tried to delete them but it's not happening. I am new to Python; can you please tell me what the problem is?
I got to know that reopening the same file in write mode will clear the contents.
I suspect the issue is that you have the same file open twice, once as newfile and a second time as a. While the file should be truncated when you open it as a and then close it, the writes you made on newfile may still appear if the filesystem had buffered them and flushed them only after the truncated version was written.
I suggest only opening the file once. When you need to clear it, seek to the start and call the truncate method:
if word not in line:
    newfile.seek(0)
    newfile.truncate()
truncate() cuts the file off at the current position, which is why the seek(0) comes first; after this pair, subsequent writes start again from the beginning of the now-empty file.
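A tiny self-contained demonstration of that behavior (the filename is arbitrary):
with open('demo.txt', 'w') as f:
    f.write('first draft\n')
    f.seek(0)  # rewind before truncating
    f.truncate()  # the file is now empty
    f.write('x')  # writes start at the beginning again
# demo.txt now contains just 'x'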
Should be something like this:
temp_lines = []
found_keyword = False
keyword = "Austin-Backup"
with open('test.txt') as oldfile, open('script.txt', 'w') as newfile:
    for line in oldfile:
        if line.startswith("edit"):
            # a new block starts: flush the previous one if it matched
            if found_keyword:
                newfile.writelines(temp_lines)
            temp_lines = []
            found_keyword = False
        elif keyword in line:
            found_keyword = True
        temp_lines.append(line)
    # flush the final block
    if found_keyword:
        newfile.writelines(temp_lines)
Please note that you should not open the file twice. Just buffer the lines in a temporary variable and write only what has to be written.

How to remove lines from a file that is being printed with certain conditions?

def codeOnly(file):
    '''Opens a file and prints the content excluding anything with a hash in it'''
    f = open('boring.txt', 'r')
    codecontent = f.read()
    print(codecontent)

codeOnly('boring.txt')
I want to open this file and print its contents, except that I don't want to print any lines with hashes in them. Is there a way to prevent those lines from being printed?
The following script will print all lines which do not contain a #:
def codeOnly(file):
    '''Opens a file and prints the content excluding anything with a hash in it'''
    with open(file, 'r') as f_input:
        for line in f_input:
            if '#' not in line:
                print(line, end='')

codeOnly('boring.txt')
Using with will ensure that the file is automatically closed afterwards.
You can check whether the line contains a hash with not '#' in line (using in):
def codeOnly(file):
    '''Opens a file and prints the content excluding anything with a hash in it'''
    f = open('boring.txt', 'r')
    for line in f:
        if not '#' in line:
            print(line)

codeOnly('boring.txt')
If you really want to keep only code lines, you might want to keep the part of the line before the hash, because in languages such as Python you can have code before a comment, for example:
print("test") # comments
You can find the index of the hash and slice the line:
for line in f:
    try:
        i = line.index('#')
        line = line[:i]
    except ValueError:
        pass  # no hash in this line; don't change it
Now each of your lines will contain no text from the hash (inclusive) through to the end of the line. A hash in the first position of a line will leave an empty string; you might want to handle that.
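Putting the pieces together, a sketch of a codeOnly variant along these lines (naive on purpose: it will also cut a '#' that appears inside a string literal):
def codeOnly(fname):
    '''Print each line with everything from the first hash onwards removed.'''
    with open(fname) as f:
        for line in f:
            code = line.split('#', 1)[0].rstrip()
            if code:  # skip lines that were pure comment or became empty
                print(code)

codeOnly('boring.txt')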

Why do I get "Pickle - EOFError: Ran out of input" reading an empty file?

I am getting an interesting error while trying to use Unpickler.load(), here is the source code:
open(target, 'a').close()
scores = {};
with open(target, "rb") as file:
    unpickler = pickle.Unpickler(file);
    scores = unpickler.load();
    if not isinstance(scores, dict):
        scores = {};
Here is the traceback:
Traceback (most recent call last):
  File "G:\python\pendu\user_test.py", line 3, in <module>:
    save_user_points("Magix", 30);
  File "G:\python\pendu\user.py", line 22, in save_user_points:
    scores = unpickler.load();
EOFError: Ran out of input
The file I am trying to read is empty.
How can I avoid getting this error, and get an empty variable instead?
Most of the answers here have dealt with how to manage EOFError exceptions, which is really handy if you're unsure whether the pickled object is empty or not.
However, if you're surprised that the pickle file is empty, it could be because you opened the file through 'wb' or some other mode that could have overwritten it.
for example:
filename = 'cd.pkl'
with open(filename, 'wb') as f:
    classification_dict = pickle.load(f)
This will overwrite the pickled file. You might have done this by mistake before using:
...
open(filename, 'rb') as f:
And then got the EOFError because the previous block of code overwrote the cd.pkl file.
When working in Jupyter, or in the console (Spyder), I usually write a wrapper over the reading/writing code and call the wrapper afterwards. This avoids common read-write mistakes, and saves a bit of time if you're going to be reading the same file multiple times through your travails.
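For instance, a pair of tiny wrappers along those lines (the names save_pickle/load_pickle are illustrative, not from any library):
import pickle

def save_pickle(obj, path):
    with open(path, 'wb') as f:  # 'wb' lives only here, so a read can't clobber the file
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:  # 'rb' lives only here
        return pickle.load(f)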
I would check that the file is not empty first:
import os

scores = {}  # scores is an empty dict already
if os.path.getsize(target) > 0:
    with open(target, "rb") as f:
        unpickler = pickle.Unpickler(f)
        # if file is not empty scores will be equal
        # to the value unpickled
        scores = unpickler.load()
Also open(target, 'a').close() is doing nothing in your code and you don't need to use ;.
It is very likely that the pickled file is empty.
It is surprisingly easy to overwrite a pickle file if you're copying and pasting code.
For example the following writes a pickle file:
pickle.dump(df,open('df.p','wb'))
And if you copied this code to reopen it, but forgot to change 'wb' to 'rb' then you would overwrite the file:
df=pickle.load(open('df.p','wb'))
The correct syntax is
df=pickle.load(open('df.p','rb'))
As you can see, that's actually a natural error. A typical construct for reading from an Unpickler object looks like this:
try:
    data = unpickler.load()
except EOFError:
    data = list()  # or whatever you want
EOFError is raised simply because the Unpickler was reading an empty file; it just means End Of File. You can catch that exception and return whatever you want from there.
open(target, 'a').close()
scores = {};
try:
    with open(target, "rb") as file:
        unpickler = pickle.Unpickler(file);
        scores = unpickler.load();
        if not isinstance(scores, dict):
            scores = {};
except EOFError:
    return {}
if path.exists(Score_file):
    try:
        with open(Score_file, "rb") as prev_Scr:
            return Unpickler(prev_Scr).load()
    except EOFError:
        return dict()
Had the same issue. It turns out that when I was writing to my pickle file I had not called file.close(). Inserted that line and the error was no more.
I have encountered this error many times, and it always occurs because I didn't close the file after writing into it. If we don't close the file, the content stays in the buffer and the file stays empty.
To save the content into the file, either the file should be closed or the file object should go out of scope.
That's why loading gives the "Ran out of input" error: the file is empty. So you have two options:
file_object.close()
file_object.flush(): if you don't want to close your file in the middle of the program, you can use the flush() function, as it will forcefully move the content from the buffer to the file.
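A short sketch of the flush() option (the filename is arbitrary):
import pickle

f = open('scores.pkl', 'wb')
pickle.dump({'Magix': 30}, f)
f.flush()  # push the buffered bytes out without closing the file
with open('scores.pkl', 'rb') as g:
    print(pickle.load(g))  # {'Magix': 30}
f.close()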
This error comes when your pickle file is empty (0 Bytes). You need to check the size of your pickle file first. This was the scenario in my case. Hope this helps!
Note that opening the file in a mode that includes 'a' (append) can also cause this error, because the stream position starts at the end of the file, so a load() immediately runs out of input:
pointer = open('makeaafile.txt', 'ab+')
tes = pickle.load(pointer, encoding='utf-8')
temp_model = os.path.join(models_dir, train_type + '_' + part + '_' + str(pc))
# print(type(temp_model)) # <class 'str'>
filehandler = open(temp_model, "rb")
# print(type(filehandler)) # <class '_io.BufferedReader'>
try:
    pdm_temp = pickle.load(filehandler)
except UnicodeDecodeError:
    filehandler.seek(0)  # rewind before retrying with a different encoding
    pdm_temp = pickle.load(filehandler, fix_imports=True, encoding="latin1")
from os.path import getsize as size
from pickle import *

if size(target) > 0:
    with open(target, 'rb') as f:
        scores = {i: j for i, j in enumerate(load(f))}
else:
    scores = {}
Line 1: we import the function getsize from the library os, sub-library path, and rename it with 'as' for a shorter style of writing. The important point here is that we load only the single function we need, not the whole library.
Line 2: same idea, but when we don't know at the start which parts of a module the code will use, we can import everything from the library using '*'.
Line 3: a conditional statement: if the size of your file is > 0, the object is not empty. 'target' is a variable that should be defined a bit earlier, for example: target=(r'd:\dir1\dir.2..\YourDataFile.bin')
Line 4: 'with open(target) as file:' is the standard construction for opening any file; you don't then need to call file.close(), and it helps avoid some typical errors such as "Ran out of input" or permission problems. The 'rb' mode means 'read binary': you can only read (load) the data from your binary file, but you can't modify or rewrite it.
Line 5: a dict comprehension applied to the unpickled data.
Line 6: in case your data file is empty, it will not raise any error message, but just return an empty dictionary.

Python read huge file line by line with utf-8 encoding

I want to read some quite huge files (to be precise: the Google ngram 1-word dataset) and count how many times each character occurs. Now I wrote this script:
import fileinput

files = ['../../datasets/googlebooks-eng-all-1gram-20090715-%i.csv' % value for value in range(0, 9)]

charcounts = {}
lastfile = ''
for line in fileinput.input(files):
    line = line.strip()
    data = line.split('\t')
    for character in list(data[0]):
        if character not in charcounts:
            charcounts[character] = 0
        charcounts[character] += int(data[1])
    if fileinput.filename() is not lastfile:
        print(fileinput.filename())
        lastfile = fileinput.filename()
    if fileinput.filelineno() % 100000 == 0:
        print(fileinput.filelineno())
print(charcounts)
which works fine until it reaches approximately line 700,000 of the first file, where I then get this error:
../../datasets/googlebooks-eng-all-1gram-20090715-0.csv
100000
200000
300000
400000
500000
600000
700000
Traceback (most recent call last):
  File "charactercounter.py", line 5, in <module>
    for line in fileinput.input(files):
  File "C:\Python31\lib\fileinput.py", line 254, in __next__
    line = self.readline()
  File "C:\Python31\lib\fileinput.py", line 349, in readline
    self._buffer = self._file.readlines(self._bufsize)
  File "C:\Python31\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 7771: character maps to <undefined>
To solve this I searched the web a bit, and came up with this code:
import fileinput

files = ['../../datasets/googlebooks-eng-all-1gram-20090715-%i.csv' % value for value in range(0, 9)]

charcounts = {}
lastfile = ''
for line in fileinput.input(files, False, '', 0, 'r', fileinput.hook_encoded('utf-8')):
    line = line.strip()
    data = line.split('\t')
    for character in list(data[0]):
        if character not in charcounts:
            charcounts[character] = 0
        charcounts[character] += int(data[1])
    if fileinput.filename() is not lastfile:
        print(fileinput.filename())
        lastfile = fileinput.filename()
    if fileinput.filelineno() % 100000 == 0:
        print(fileinput.filelineno())
print(charcounts)
but the hook I now use tries to read the entire 990 MB file into memory at once, which more or less crashes my pc. Does anyone know how to rewrite this code so that it actually works?
p.s.: the code hasn't even run all the way through yet, so I don't even know if it does what it is supposed to do; for that to happen I first need to fix this bug.
Oh, and I use Python 3.2
I do not know why fileinput does not work as expected.
I suggest you use the open function instead. The return value can be iterated over and will return lines, just like fileinput.
The code will then be something like:
for filename in files:
    print(filename)
    for filelineno, line in enumerate(open(filename, encoding="utf-8")):
        line = line.strip()
        data = line.split('\t')
        # ...
Some documentation links: enumerate, open, io.TextIOWrapper (open returns an instance of TextIOWrapper).
The problem is that fileinput doesn't use file.xreadlines(), which reads line by line, but file.readlines(bufsize), which reads bufsize bytes at once (and turns that into a list of lines). You are providing 0 for the bufsize parameter of fileinput.input() (which is also the default value), and a bufsize of 0 means that the whole file is buffered.
Solution: provide a reasonable bufsize.
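On the Python 3.2 the asker is using, that would look like the sketch below (hedged: the bufsize argument was later deprecated in Python 3.6 and removed in 3.8, where fileinput reads lazily anyway):
import fileinput

# 'files' is the list of csv paths from the question
for line in fileinput.input(files, False, '', 1024 * 1024, 'r', fileinput.hook_encoded('utf-8')):
    pass  # process each line; at most roughly 1 MB is buffered at a time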
This works for me: you can use "utf-8" in the hook definition. I used it on a 50GB/200M lines file with no problem.
fi = fileinput.FileInput(openhook=fileinput.hook_encoded("iso-8859-1"))
Could you try reading not the whole file, but a part of it as binary, then decode(), then process it, then call the function again to read the next part?
I don't know if the one I have is the latest version (and I don't remember how I read them), but...
$ file -i googlebooks-eng-1M-1gram-20090715-0.csv
googlebooks-eng-1M-1gram-20090715-0.csv: text/plain; charset=us-ascii
Have you tried fileinput.hook_encoded('ascii') or fileinput.hook_encoded('latin_1')? Not sure why this would make a difference, since I think these are just subsets of unicode with the same mapping, but worth a try.
EDIT: I think this might be a bug in fileinput; neither of these works.
If you are worried about memory usage, why not read by line using readline()? This will get rid of the memory issues you are running into. Currently you are reading the full file before performing any actions on the fileObj; with readline() you are not saving the data, merely searching it on a per-line basis.
def charCount1(_file, _char):
    result = []
    file = open(_file, encoding="utf-8")
    data = file.read()
    file.close()
    for index, line in enumerate(data.split("\n")):
        if _char in line:
            result.append(index)
    return result

def charCount2(_file, _char):
    result = []
    count = 0
    file = open(_file, encoding="utf-8")
    while 1:
        line = file.readline()
        if _char in line:
            result.append(count)
        count += 1
        if not line:
            break
    file.close()
    return result
I didn't have a chance to really look over your code, but the above samples should give you an idea of how to make the appropriate changes to your structure. charCount1() demonstrates your method, which caches the entire file in a single call to read(). I tested your method on a 400+ MB text file and the python.exe process went as high as 900+ MB. When you run charCount2(), the python.exe process shouldn't exceed more than a few MB (provided you haven't bulked up the size with other code).
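For completeness, a hedged line-by-line rewrite of the asker's counting loop in the style of charCount2() (assuming the tab-separated format from the question, with the counted string in column 0 and its count in column 1):
from collections import Counter

charcounts = Counter()
for filename in files:  # 'files' as defined in the question
    with open(filename, encoding='utf-8') as f:
        for line in f:
            data = line.strip().split('\t')
            for character in data[0]:
                charcounts[character] += int(data[1])
print(charcounts)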

How to modify a text file?

I'm using Python, and would like to insert a string into a text file without deleting or copying the file. How can I do that?
Unfortunately there is no way to insert into the middle of a file without re-writing it. As previous posters have indicated, you can append to a file or overwrite part of it using seek but if you want to add stuff at the beginning or the middle, you'll have to rewrite it.
This is an operating system thing, not a Python thing. It is the same in all languages.
What I usually do is read from the file, make the modifications, and write them out to a new file called myfile.txt.tmp or something like that. This is better than reading the whole file into memory, because the file may be too large for that. Once the temporary file is completed, I rename it to the original file's name.
This is a good, safe way to do it, because if the file write crashes or aborts for any reason, you still have your untouched original file.
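A minimal sketch of that pattern (modify_line stands in for whatever edit you need; it is not a real library function):
import os

def rewrite(path, modify_line):
    tmp = path + '.tmp'
    with open(path) as src, open(tmp, 'w') as dst:
        for line in src:
            dst.write(modify_line(line))  # transform each line on the way through
    os.replace(tmp, path)  # atomically swap the finished copy into place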
Depends on what you want to do. To append you can open it with "a":
with open("foo.txt", "a") as f:
f.write("new line\n")
If you want to prepend something, you have to read from the file first:
with open("foo.txt", "r+") as f:
old = f.read() # read everything in the file
f.seek(0) # rewind
f.write("new line\n" + old) # write the new line before
The fileinput module of the Python standard library will rewrite a file inplace if you use the inplace=1 parameter:
import sys
import fileinput

# replace all occurrences of 'sit' with 'SIT' and insert a line after the 5th
for i, line in enumerate(fileinput.input('lorem_ipsum.txt', inplace=1)):
    sys.stdout.write(line.replace('sit', 'SIT'))  # replace 'sit' and write
    if i == 4:
        sys.stdout.write('\n')  # write a blank line after the 5th line
Rewriting a file in place is often done by saving the old copy with a modified name. Unix folks add a ~ to mark the old one. Windows folks do all kinds of things -- add .bak or .old -- or rename the file entirely or put the ~ on the front of the name.
import shutil
shutil.move(aFile, aFile + "~")

destination = open(aFile, "w")
source = open(aFile + "~", "r")
for line in source:
    destination.write(line)
    if <some condition>:
        destination.write(<some additional line> + "\n")
source.close()
destination.close()

Instead of shutil, you can use the following:
import os
os.rename(aFile, aFile + "~")
Python's mmap module will allow you to insert into a file. The following sample shows how it can be done in Unix (Windows mmap may be different). Note that this does not handle all error conditions and you might corrupt or lose the original file. Also, this won't handle unicode strings.
import os
from mmap import mmap

def insert(filename, str, pos):
    if len(str) < 1:
        # nothing to insert
        return

    f = open(filename, 'r+')
    m = mmap(f.fileno(), os.path.getsize(filename))
    origSize = m.size()

    # or this could be an error
    if pos > origSize:
        pos = origSize
    elif pos < 0:
        pos = 0

    m.resize(origSize + len(str))
    m[pos + len(str):] = m[pos:origSize]
    m[pos:pos + len(str)] = str
    m.close()
    f.close()
It is also possible to do this without mmap, with the file opened in 'r+' mode, but it is less convenient and less efficient, as you'd have to read and temporarily store the contents of the file from the insertion position to EOF, which might be huge.
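A minimal sketch of that r+ approach (hedged: pos is an offset from the start, and in text mode it must land on a character boundary):
def insert_at(filename, text, pos):
    with open(filename, 'r+') as f:
        f.seek(pos)
        tail = f.read()  # everything from pos to EOF, held in memory
        f.seek(pos)
        f.write(text + tail)  # rewrite from pos onward, growing the file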
As mentioned by Adam, you have to take your system's limitations into consideration before deciding on an approach: do you have enough memory to read the whole file into memory, replace parts of it, and rewrite it?
If you're dealing with a small file or have no memory issues this might help:
Option 1)
Read the entire file into memory, do a regex substitution on all or part of a line, and replace it with that line plus the extra line. You will need to make sure that the 'middle line' is unique in the file; if you have timestamps on each line, this should be pretty reliable.
import re

# open file with r+b (allow write and binary mode)
f = open("file.log", 'r+b')
# read entire content of file into memory
f_content = f.read()
# basically match middle line and replace it with itself and the extra line
f_content = re.sub(r'(middle line)', r'\1\nnew line', f_content)
# return pointer to top of file so we can re-write the content with replaced string
f.seek(0)
# clear file content
f.truncate()
# re-write the content with the updated content
f.write(f_content)
# close file
f.close()
Option 2)
Figure out middle line, and replace it with that line plus the extra line.
# open file with r+b (allow write and binary mode)
f = open("file.log", 'r+b')
# get array of lines
f_content = f.readlines()
# get middle line
middle_line = len(f_content) // 2
# overwrite middle line
f_content[middle_line] += "\nnew line"
# return pointer to top of file so we can re-write the content with replaced string
f.seek(0)
# clear file content
f.truncate()
# re-write the content with the updated content
f.write(''.join(f_content))
# close file
f.close()
Wrote a small class for doing this cleanly:
import tempfile

class FileModifierError(Exception):
    pass

class FileModifier(object):

    def __init__(self, fname):
        self.__write_dict = {}
        self.__filename = fname
        self.__tempfile = tempfile.TemporaryFile()
        with open(fname, 'rb') as fp:
            for line in fp:
                self.__tempfile.write(line)
        self.__tempfile.seek(0)

    def write(self, s, line_number='END'):
        if line_number != 'END' and not isinstance(line_number, (int, float)):
            raise FileModifierError("Line number %s is not a valid number" % line_number)
        try:
            self.__write_dict[line_number].append(s)
        except KeyError:
            self.__write_dict[line_number] = [s]

    def writeline(self, s, line_number='END'):
        self.write('%s\n' % s, line_number)

    def writelines(self, s, line_number='END'):
        for ln in s:
            self.writeline(ln, line_number)

    def __popline(self, index, fp):
        try:
            ilines = self.__write_dict.pop(index)
            for line in ilines:
                fp.write(line)
        except KeyError:
            pass

    def close(self):
        self.__exit__(None, None, None)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        with open(self.__filename, 'w') as fp:
            for index, line in enumerate(self.__tempfile.readlines()):
                self.__popline(index, fp)
                fp.write(line)
            for index in sorted(self.__write_dict):
                for line in self.__write_dict[index]:
                    fp.write(line)
        self.__tempfile.close()
Then you can use it this way:
with FileModifier(filename) as fp:
    fp.writeline("String 1", 0)
    fp.writeline("String 2", 20)
    fp.writeline("String 3")  # To write at the end of the file
If you know some unix, you could try the following:
Notes: $ means the command prompt.
Say you have a file my_data.txt with content as such:
$ cat my_data.txt
This is a data file
with all of my data in it.
Then, using the os module, you can use the usual sed commands:
import os

# Identifiers used are:
my_data_file = "my_data.txt"
command = "sed -i 's/all/none/' my_data.txt"

# Execute the command
os.system(command)
If you aren't aware of sed, check it out, it is extremely useful.
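If you shell out like this, the subprocess module is generally preferred over os.system nowadays; a minimal equivalent sketch:
import subprocess

# same substitution as above, without going through a shell
subprocess.run(["sed", "-i", "s/all/none/", "my_data.txt"], check=True)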
