Huffman encoding: how to write binary data in Python

I have tried methods using the struct module, as shown by the lines commented out in my code, but they didn't work out. Basically I have two options: I can either write the binary data code by code (my codes are bit sequences varying in length from 3 to 13 bits), or convert the whole string of n characters (n = 25000+ in this case) to binary data. But I don't know how to implement either method. Code:
import heapq
import binascii
import struct

def createFrequencyTupleList(inputFile):
    frequencyDic = {}
    intputFile = open(inputFile, 'r')
    for line in intputFile:
        for char in line:
            if char in frequencyDic.keys():
                frequencyDic[char] += 1
            else:
                frequencyDic[char] = 1
    intputFile.close()
    tupleList = []
    for myKey in frequencyDic:
        tupleList.append((frequencyDic[myKey], myKey))
    return tupleList

def createHuffmanTree(frequencyList):
    heapq.heapify(frequencyList)
    n = len(frequencyList)
    for i in range(1, n):
        left = heapq.heappop(frequencyList)
        right = heapq.heappop(frequencyList)
        newNode = (left[0] + right[0], left, right)
        heapq.heappush(frequencyList, newNode)
    return frequencyList[0]

def printHuffmanTree(myTree, someCode, prefix=''):
    if len(myTree) == 2:
        someCode.append((myTree[1] + "#" + prefix))
    else:
        printHuffmanTree(myTree[1], someCode, prefix + '0')
        printHuffmanTree(myTree[2], someCode, prefix + '1')

def parseCode(char, myCode):
    for k in myCode:
        if char == k[0]:
            return k[2:]

if __name__ == '__main__':
    myList = createFrequencyTupleList('input')
    myHTree = createHuffmanTree(myList)
    myCode = []
    printHuffmanTree(myHTree, myCode)

    inputFile = open('input', 'r')
    outputFile = open('encoded_file2', "w+b")
    asciiString = ''
    n = 0
    for line in inputFile:
        for char in line:
            #outputFile.write(parseCode(char, myCode))
            asciiString += parseCode(char, myCode)
            n += len(parseCode(char, myCode))

    #values = asciiString
    #print n
    #s = struct.Struct('25216s')
    #packed_data = s.pack(values)
    #print packed_data
    inputFile.close()
    #outputFile.write(packed_data)
    outputFile.close()

You're looking for this:
packed_data = ''.join(chr(int(asciiString[i:i+8], 2))
                      for i in range(0, len(asciiString), 8))
It will take 8 bits at a time from the asciiString, interpret it as an integer, and output the corresponding byte.
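For example, chr(int('01000001', 2)) gives 'A'.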
Your problem here is that this requires the length of asciiString to be a multiple of 8 bits to work correctly. If not, you'll insert zero bits before the last few real bits.
So you need to store the number of bits in the last byte somewhere, so you know to ignore those bits when you get them back, instead of interpreting them as zeros. You could try:
packed_data = chr(len(asciiString) % 8) + packed_data
Then when you read it back:
from itertools import chain

packed_input = coded_file.read()
last_byte_length, packed_input, last_byte = (ord(packed_input[0]),
                                             packed_input[1:-1],
                                             packed_input[-1])
if not last_byte_length:
    last_byte_length = 8
ascii_input = ''.join(chain((bin(ord(byte))[2:].zfill(8) for byte in packed_input),
                            (bin(ord(last_byte))[2:].zfill(last_byte_length),)))
# OR
# ascii_input = ''.join(chain(('{0:0=8b}'.format(ord(byte)) for byte in packed_input),
#                             (('{0:0=' + str(last_byte_length) + 'b}').format(ord(last_byte)),)))
Edit: You either need to strip '0b' from the strings returned by bin() or, on 2.6 or newer, preferably use the new, alternate versions I added that use string formatting instead of bin(), slicing, and zfill().
Edit: Thanks eryksun, good to use chain to avoid making a copy of the ASCII string. Also, need to call ord(byte) in the bin() version.
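For readers on Python 3, here is a minimal, self-contained round-trip sketch of the same length-header scheme (not from the original answer; the helper names are mine, and the final byte is right-padded rather than left-padded):

def pack_bits(bitstring):
    # Header byte stores how many bits of the last byte are real (0 means all 8).
    out = bytearray([len(bitstring) % 8])
    for i in range(0, len(bitstring), 8):
        # Pad the final chunk on the right so every chunk is one full byte.
        out.append(int(bitstring[i:i+8].ljust(8, '0'), 2))
    return bytes(out)

def unpack_bits(packed):
    last_len = packed[0] or 8
    bits = ''.join(format(b, '08b') for b in packed[1:])
    # Drop the pad bits that were added to the final byte.
    return bits[:len(bits) - (8 - last_len)]

assert unpack_bits(pack_bits('1010111')) == '1010111'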

Related

How Does The Base64 Work and How To Encode/Decode in it?

I have a problem that asks me to encode a string to base64 format. I think I got it, based on my code: the string "Man" works, and other short strings work, but the string "this is a string!!" doesn't. The question asks me to use the non-padding version. Can you please explain the process of how to encode the string "this is a string!!"? As I understand it, I have to turn the letters into ASCII, then into binary, divide the bits into groups of 6, turn each group back into decimal, and look each value up in the base64 table. That is all I know. But please don't give me the code; I want to try the coding on my own, so please just explain the process. There are no good videos explaining this topic! And by the way, I am using Python. Thank you
Here is the code I have:
def decimal(binary):
    binary = str(binary); power = len(binary)-1
    values = []
    for x in binary:
        if x == "1":
            values.append((x, 2**power))
        power -= 1
    return sum([v for b,v in values if b == "1"])

string = "Man"
byte = ""
for x in string:
    byte += bin(ord(x))[0] + bin(ord(x))[2:]

values = []
for x in range(0, len(byte), 6):
    values.append(byte[x:x+6])

abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
table = {x:abc[x] for x in range(len(abc))}
print("".join(table[decimal(x)] for x in values))
Adjusted parts are explained using in-line comments:
import sys   # merely for manipulation with supplied arguments
import math

if len(sys.argv) == 1:
    string = "This is a string!!!"
else:
    string = ' '.join([sys.argv[i] for i in range(1, len(sys.argv))])

def decimal(binary):
    binary = str(binary); power = len(binary)-1
    values = []
    for x in binary:
        if x == "1":
            values.append((x, 2**power))
        power -= 1
    return sum([v for b,v in values if b == "1"])

byte = ""
for x in string.encode('utf-8'):       # ASCII is a proper subset of UTF-8
    byte += bin(x)[2:].rjust(8, '0')   # get binary string of length 8
byte = byte.ljust(math.ceil(len(byte)/6)*6, '0')  # length must be divisible by 6

values = []
for x in range(0, len(byte), 6):
    values.append(byte[x:x+6])

abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
table = {x:abc[x] for x in range(len(abc))}

print(string)                          # input
padding = '=' * (((3 - len(string.encode('utf-8'))) % 3) % 3)
output = "".join(table[decimal(x)] for x in values)
print(output)
print(output + padding)                # for the sake of completeness

import base64                          # merely for comparison/reference output
# ↓↓↓ output from base64 module ↓↓↓
print(base64.b64encode(string.encode('utf-8')).decode('utf-8'))
Output from the two runs .\SO\66724448.py ěščř ĚŠČŘ and .\SO\66724448.py:
ěščř ĚŠČŘ
xJvFocSNxZkgxJrFoMSMxZg
xJvFocSNxZkgxJrFoMSMxZg=
xJvFocSNxZkgxJrFoMSMxZg=
This is a string!!!
VGhpcyBpcyBhIHN0cmluZyEhIQ
VGhpcyBpcyBhIHN0cmluZyEhIQ==
VGhpcyBpcyBhIHN0cmluZyEhIQ==
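As a worked illustration of that process (my own addition, not part of the original posts): "Man" is ASCII 77, 97, 110, which is 24 bits; regrouped into 6-bit chunks these are the values 19, 22, 5 and 46, which the base64 alphabet maps to 'TWFu':

bits = ''.join(format(ord(c), '08b') for c in "Man")    # '010011010110000101101110'
groups = [bits[i:i+6] for i in range(0, len(bits), 6)]  # ['010011', '010110', '000101', '101110']
abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
print(''.join(abc[int(g, 2)] for g in groups))          # TWFu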

python operators = and == counter

I wrote a .txt file in which I put the operators = and ==, and code that should count the number of = and == occurrences, but I don't get the correct numbers.
lexicalClass = file.readlines()
for lex in lexicalClass:
    newList = re.findall('\S+', lex)
    for element in newList:
        if len(re.findall('[a-z]+|[0-9]+', element)):
            identifiers.append(re.findall('[a-z]+|[0-9]+', element))
            num = len(re.findall('\=', element))
            if int(num):
                if int(num) % 2 == 1:
                    for i in range(int((num-1)/2)):
                        equal.append('==')
                    assignment.append('=')
                else:
                    for i in range(int(num/2)):
                        equal.append('==')
print(str(len(equal)))
print(str(len(assignment)))
My .txt file: a == b a = b c = d
And as you can see, my output should be 1 and 2, but I'm getting 0 for both.
You could probably do this with lookahead and lookbehind assertions:
import re

one_equals = r"(?<!=)=(?!=)"    # a "=" neither preceded nor followed by "="
two_equals = r"(?<!=)==(?!=)"   # "==" neither preceded nor followed by "="

assignments = 0
equals = 0
with open("yourfilename.txt") as f:
    for line in f:
        assignments += len(re.findall(one_equals, line))
        equals += len(re.findall(two_equals, line))
If this is Python source code, the correct way to do this is with the ast module, using ast.walk() and counting instances of the ast.Assign and ast.Eq nodes:
import ast

with open("yourfilename.txt") as f:
    parsed_source = ast.parse(f.read())

nodes = list(ast.walk(parsed_source))
equals = sum(isinstance(n, ast.Eq) for n in nodes)
assignments = sum(isinstance(n, ast.Assign) for n in nodes)
If you don't really care about the efficiency of the algorithm, this is a fairly simple solution:
file = open("asd.txt")
total_double_eq_count = 0
total_single_eq_count = 0

# iterate over the lines of the file
for line in file:
    # count of '=='s in the line
    double_eq_count = line.count("==")
    # count of '='s which are not part of a '=='
    single_eq_count = line.count("=") - 2*double_eq_count
    total_double_eq_count += double_eq_count
    total_single_eq_count += single_eq_count

print(total_double_eq_count)
print(total_single_eq_count)
But this is relatively fast compared to equivalent hand-rolled Python code, since the counting uses built-in string methods, at least on small inputs.
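As a quick check of that counting logic (my addition), the sample line from the question gives the expected 1 and 2:

line = "a == b a = b c = d"
double_eq = line.count("==")                # 1
single_eq = line.count("=") - 2*double_eq   # 4 '=' characters in total, minus 2 -> 2
print(double_eq, single_eq)                 # 1 2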

Python: Read last 'n' lines from a file [duplicate]

I'm writing a log file viewer for a web application and for that I want to paginate through the lines of the log file. The items in the file are line based with the newest item at the bottom.
So I need a tail() method that can read n lines from the bottom and support an offset. This is what I came up with:
def tail(f, n, offset=0):
    """Reads n lines from f with an offset of offset lines."""
    avg_line_length = 74
    to_read = n + offset
    while 1:
        try:
            f.seek(-(avg_line_length * to_read), 2)
        except IOError:
            # woops.  apparently file is smaller than what we want
            # to step back, go to the beginning instead
            f.seek(0)
        pos = f.tell()
        lines = f.read().splitlines()
        if len(lines) >= to_read or pos == 0:
            return lines[-to_read:offset and -offset or None]
        avg_line_length *= 1.3
Is this a reasonable approach? What is the recommended way to tail log files with offsets?
This may be quicker than yours. Makes no assumptions about line length. Backs through the file one block at a time till it's found the right number of '\n' characters.
def tail(f, lines=20):
    total_lines_wanted = lines

    BLOCK_SIZE = 1024
    f.seek(0, 2)
    block_end_byte = f.tell()
    lines_to_go = total_lines_wanted
    block_number = -1
    blocks = []  # blocks of size BLOCK_SIZE, in reverse order starting
                 # from the end of the file
    while lines_to_go > 0 and block_end_byte > 0:
        if (block_end_byte - BLOCK_SIZE > 0):
            # read the last block we haven't yet read
            f.seek(block_number*BLOCK_SIZE, 2)
            blocks.append(f.read(BLOCK_SIZE))
        else:
            # file too small, start from beginning
            f.seek(0, 0)
            # only read what was not read
            blocks.append(f.read(block_end_byte))
        lines_found = blocks[-1].count('\n')
        lines_to_go -= lines_found
        block_end_byte -= BLOCK_SIZE
        block_number -= 1
    all_read_text = ''.join(reversed(blocks))
    return '\n'.join(all_read_text.splitlines()[-total_lines_wanted:])
I don't like tricky assumptions about line length when -- as a practical matter -- you can never know things like that.
Generally, this will locate the last 20 lines on the first or second pass through the loop. If your 74 character thing is actually accurate, you make the block size 2048 and you'll tail 20 lines almost immediately.
Also, I don't burn a lot of brain calories trying to finesse alignment with physical OS blocks. Using these high-level I/O packages, I doubt you'll see any performance consequence of trying to align on OS block boundaries. If you use lower-level I/O, then you might see a speedup.
UPDATE
For Python 3.2 and up, perform the process on bytes, because in text files (those opened without a "b" in the mode string) only seeks relative to the beginning of the file are allowed (the exception being a seek to the very end of the file with seek(0, 2)). For example:

f = open('C:/.../../apache_logs.txt', 'rb')
def tail(f, lines=20):
    total_lines_wanted = lines

    BLOCK_SIZE = 1024
    f.seek(0, 2)
    block_end_byte = f.tell()
    lines_to_go = total_lines_wanted
    block_number = -1
    blocks = []
    while lines_to_go > 0 and block_end_byte > 0:
        if (block_end_byte - BLOCK_SIZE > 0):
            f.seek(block_number*BLOCK_SIZE, 2)
            blocks.append(f.read(BLOCK_SIZE))
        else:
            f.seek(0, 0)
            blocks.append(f.read(block_end_byte))
        lines_found = blocks[-1].count(b'\n')
        lines_to_go -= lines_found
        block_end_byte -= BLOCK_SIZE
        block_number -= 1
    all_read_text = b''.join(reversed(blocks))
    return b'\n'.join(all_read_text.splitlines()[-total_lines_wanted:])
Assumes a unix-like system. On Python 2 you can do:
import os

def tail(f, n, offset=0):
    stdin, stdout = os.popen2("tail -n " + str(n + offset) + " " + f)
    stdin.close()
    lines = stdout.readlines()
    stdout.close()
    return lines[:-offset or None]
For Python 3 you may do:
import subprocess

def tail(f, n, offset=0):
    proc = subprocess.Popen(['tail', '-n', str(n + offset), f], stdout=subprocess.PIPE)
    lines = proc.stdout.readlines()
    return lines[:-offset or None]
Here is my answer. Pure python. Using timeit it seems pretty fast. Tailing 100 lines of a log file that has 100,000 lines:
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=10)
0.0014600753784179688
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=100)
0.00899195671081543
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=1000)
0.05842900276184082
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=10000)
0.5394978523254395
>>> timeit.timeit('tail.tail(f, 100, 4098)', 'import tail; f = open("log.txt", "r");', number=100000)
5.377126932144165
Here is the code:
import os
def tail(f, lines=1, _buffer=4098):
"""Tail a file and get X lines from the end"""
# place holder for the lines found
lines_found = []
# block counter will be multiplied by buffer
# to get the block size from the end
block_counter = -1
# loop until we find X lines
while len(lines_found) < lines:
try:
f.seek(block_counter * _buffer, os.SEEK_END)
except IOError: # either file is too small, or too many lines requested
f.seek(0)
lines_found = f.readlines()
break
lines_found = f.readlines()
# we found enough lines, get out
# Removed this line because it was redundant the while will catch
# it, I left it for history
# if len(lines_found) > lines:
# break
# decrement the block counter to get the
# next X bytes
block_counter -= 1
return lines_found[-lines:]
If reading the whole file is acceptable then use a deque.
from collections import deque
deque(f, maxlen=n)
Prior to 2.6, deques didn't have a maxlen option, but it's easy enough to implement.
import itertools

def maxque(items, size):
    items = iter(items)
    q = deque(itertools.islice(items, size))
    for item in items:
        del q[0]
        q.append(item)
    return q
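For instance (file name hypothetical), either variant is used the same way:

from collections import deque

with open('app.log') as f:
    last_ten = deque(f, maxlen=10)   # on 2.6+; use maxque(f, 10) on older versions
print(''.join(last_ten))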
If it's a requirement to read the file from the end, then use a gallop (a.k.a. exponential) search.
def tail(f, n):
    assert n >= 0
    pos, lines = n+1, []
    while len(lines) <= n:
        try:
            f.seek(-pos, 2)
        except IOError:
            f.seek(0)
            break
        finally:
            lines = list(f)
        pos *= 2
    return lines[-n:]
S.Lott's answer above almost works for me but ends up giving me partial lines. It turns out that it corrupts data on block boundaries because data holds the read blocks in reversed order. When ''.join(data) is called, the blocks are in the wrong order. This fixes that.
def tail(f, window=20):
    """
    Returns the last `window` lines of file `f` as a list.

    f - a byte file-like object
    """
    if window == 0:
        return []
    BUFSIZ = 1024
    f.seek(0, 2)
    bytes = f.tell()
    size = window + 1
    block = -1
    data = []
    while size > 0 and bytes > 0:
        if bytes - BUFSIZ > 0:
            # Seek back one whole BUFSIZ
            f.seek(block * BUFSIZ, 2)
            # read BUFFER
            data.insert(0, f.read(BUFSIZ))
        else:
            # file too small, start from beginning
            f.seek(0, 0)
            # only read what was not read
            data.insert(0, f.read(bytes))
        linesFound = data[0].count('\n')
        size -= linesFound
        bytes -= BUFSIZ
        block -= 1
    return ''.join(data).splitlines()[-window:]
The code I ended up using. I think this is the best so far:
def tail(f, n, offset=None):
    """Reads n lines from f with an offset of offset lines.  The return
    value is a tuple in the form ``(lines, has_more)`` where `has_more` is
    an indicator that is `True` if there are more lines in the file.
    """
    avg_line_length = 74
    to_read = n + (offset or 0)

    while 1:
        try:
            f.seek(-(avg_line_length * to_read), 2)
        except IOError:
            # woops.  apparently file is smaller than what we want
            # to step back, go to the beginning instead
            f.seek(0)
        pos = f.tell()
        lines = f.read().splitlines()
        if len(lines) >= to_read or pos == 0:
            return lines[-to_read:offset and -offset or None], \
                   len(lines) > to_read or pos > 0
        avg_line_length *= 1.3
Simple and fast solution with mmap:
import mmap
import os

def tail(filename, n):
    """Returns last n lines from the filename. No exception handling"""
    size = os.path.getsize(filename)
    with open(filename, "rb") as f:
        # for Windows the mmap parameters are different
        fm = mmap.mmap(f.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ)
        try:
            for i in xrange(size - 1, -1, -1):
                if fm[i] == '\n':
                    n -= 1
                    if n == -1:
                        break
            return fm[i + 1 if i else 0:].splitlines()
        finally:
            fm.close()
An update of @papercrane's solution for Python 3.
Open the file with open(filename, 'rb') and:
def tail(f, window=20):
    """Returns the last `window` lines of file `f` as a list.
    """
    if window == 0:
        return []

    BUFSIZ = 1024
    f.seek(0, 2)
    remaining_bytes = f.tell()
    size = window + 1
    block = -1
    data = []

    while size > 0 and remaining_bytes > 0:
        if remaining_bytes - BUFSIZ > 0:
            # Seek back one whole BUFSIZ
            f.seek(block * BUFSIZ, 2)
            # read BUFFER
            bunch = f.read(BUFSIZ)
        else:
            # file too small, start from beginning
            f.seek(0, 0)
            # only read what was not read
            bunch = f.read(remaining_bytes)

        bunch = bunch.decode('utf-8')
        data.insert(0, bunch)
        size -= bunch.count('\n')
        remaining_bytes -= BUFSIZ
        block -= 1

    return ''.join(data).splitlines()[-window:]
The simplest way is to use deque:
from collections import deque

def tail(filename, n=10):
    with open(filename) as f:
        return deque(f, n)
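This works because deque, when given an iterable and a maxlen, consumes the file line by line while keeping only the last n items. A usage sketch (file name hypothetical):

print(list(tail('server.log', 3)))   # last three lines, newlines included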
Posting an answer at the behest of commenters on my answer to a similar question where the same technique was used to mutate the last line of a file, not just get it.
For a file of significant size, mmap is the best way to do this. To improve on the existing mmap answer, this version is portable between Windows and Linux, and should run faster (though it won't work without some modifications on 32 bit Python with files in the GB range, see the other answer for hints on handling this, and for modifying to work on Python 2).
import io  # Gets consistent version of open for both Py2.7 and Py3.x
import itertools
import mmap

def skip_back_lines(mm, numlines, startidx):
    '''Factored out to simplify handling of n and offset'''
    for _ in itertools.repeat(None, numlines):
        startidx = mm.rfind(b'\n', 0, startidx)
        if startidx < 0:
            break
    return startidx

def tail(f, n, offset=0):
    # Reopen file in binary mode
    with io.open(f.name, 'rb') as binf, mmap.mmap(binf.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        # len(mm) - 1 handles files ending w/newline by getting the prior line
        startofline = skip_back_lines(mm, offset, len(mm) - 1)
        if startofline < 0:
            return []  # Offset lines consumed whole file, nothing to return
            # If using a generator function (yield-ing, see below),
            # this should be a plain return, no empty list

        endoflines = startofline + 1  # Slice end to omit offset lines

        # Find start of lines to capture (add 1 to move from newline to beginning of following line)
        startofline = skip_back_lines(mm, n, startofline) + 1

        # Passing True to splitlines makes it return the list of lines without
        # removing the trailing newline (if any), so list mimics f.readlines()
        return mm[startofline:endoflines].splitlines(True)
        # If Windows style \r\n newlines need to be normalized to \n, and input
        # is ASCII compatible, can normalize newlines with:
        # return mm[startofline:endoflines].replace(os.linesep.encode('ascii'), b'\n').splitlines(True)
This assumes the number of lines tailed is small enough you can safely read them all into memory at once; you could also make this a generator function and manually read a line at a time by replacing the final line with:
        mm.seek(startofline)
        # Call mm.readline n times, or until EOF, whichever comes first
        # Python 3.2 and earlier:
        for line in itertools.islice(iter(mm.readline, b''), n):
            yield line

        # 3.3+:
        yield from itertools.islice(iter(mm.readline, b''), n)
Lastly, this read in binary mode (necessary to use mmap) so it gives str lines (Py2) and bytes lines (Py3); if you want unicode (Py2) or str (Py3), the iterative approach could be tweaked to decode for you and/or fix newlines:
        lines = itertools.islice(iter(mm.readline, b''), n)

        if f.encoding:  # Decode if the passed file was opened with a specific encoding
            lines = (line.decode(f.encoding) for line in lines)
        if 'b' not in f.mode:  # Fix line breaks if passed file opened in text mode
            lines = (line.replace(os.linesep, '\n') for line in lines)

        # Python 3.2 and earlier:
        for line in lines:
            yield line
        # 3.3+:
        yield from lines
Note: I typed this all up on a machine where I lack access to Python to test. Please let me know if I typoed anything; this was similar enough to my other answer that I think it should work, but the tweaks (e.g. handling an offset) could lead to subtle errors. Please let me know in the comments if there are any mistakes.
An even cleaner python3 compatible version that doesn't insert but appends & reverses:
def tail(f, window=1):
    """
    Returns the last `window` lines of file `f` as a list of bytes.
    """
    if window == 0:
        return b''
    BUFSIZE = 1024
    f.seek(0, 2)
    end = f.tell()
    nlines = window + 1
    data = []
    while nlines > 0 and end > 0:
        i = max(0, end - BUFSIZE)
        nread = min(end, BUFSIZE)

        f.seek(i)
        chunk = f.read(nread)
        data.append(chunk)
        nlines -= chunk.count(b'\n')
        end -= nread
    return b'\n'.join(b''.join(reversed(data)).splitlines()[-window:])
use it like this:
with open(path, 'rb') as f:
    last_lines = tail(f, 3).decode('utf-8')
Simple:
with open("test.txt") as f:
    data = f.readlines()
    tail = data[-2:]
    print(''.join(tail))
based on S.Lott's top voted answer (Sep 25 '08 at 21:43), but fixed for small files.
def tail(the_file, lines_2find=20):
    the_file.seek(0, 2)  # go to end of file
    bytes_in_file = the_file.tell()
    lines_found, total_bytes_scanned = 0, 0
    while lines_2find+1 > lines_found and bytes_in_file > total_bytes_scanned:
        byte_block = min(1024, bytes_in_file-total_bytes_scanned)
        the_file.seek(-(byte_block+total_bytes_scanned), 2)
        total_bytes_scanned += byte_block
        lines_found += the_file.read(1024).count('\n')
    the_file.seek(-total_bytes_scanned, 2)
    line_list = list(the_file.readlines())
    return line_list[-lines_2find:]
    # we read at least 21 line breaks from the bottom, block by block for speed
    # 21 to ensure we don't get a half line
Hope this is useful.
There are some existing implementations of tail on pypi which you can install using pip:
mtFileUtil
multitail
log4tailer
...
Depending on your situation, there may be advantages to using one of these existing tools.
I found the Popen above to be the best solution. It's quick and dirty and it works. For Python 2.6 on a Unix machine I used the following:
import subprocess

def GetLastNLines(n, fileName):
    """
    Name: GetLastNLines
    Description: Gets last n lines using Unix tail
    Output: returns last n lines of a file
    Keyword arguments:
    n -- number of last lines to return
    fileName -- name of the file you need to tail into
    """
    p = subprocess.Popen(['tail', '-n', str(n), fileName], stdout=subprocess.PIPE)
    soutput, sinput = p.communicate()
    return soutput
soutput will contain the last n lines of the file. To iterate through soutput line by line do:
for line in GetLastNLines(50, 'myfile.log').split('\n'):
    print line
For efficiency with very large files (common in logfile situations where you may want to use tail), you generally want to avoid reading the whole file (even if you do it without reading the whole file into memory at once). However, you do need to somehow work out the offset in lines rather than characters. One possibility is reading backwards with seek() character by character, but this is very slow. Instead, it's better to process in larger blocks.
I've a utility function I wrote a while ago to read files backwards that can be used here.
import os, itertools

def rblocks(f, blocksize=4096):
    """Read file as series of blocks from end of file to start.

    The data itself is in normal order, only the order of the blocks is reversed.
    ie. "hello world" -> ["ld","wor", "lo ", "hel"]
    Note that the file must be opened in binary mode.
    """
    if 'b' not in f.mode.lower():
        raise Exception("File must be opened using binary mode.")
    size = os.stat(f.name).st_size
    fullblocks, lastblock = divmod(size, blocksize)

    # The first(end of file) block will be short, since this leaves
    # the rest aligned on a blocksize boundary.  This may be more
    # efficient than having the last (first in file) block be short
    f.seek(-lastblock, 2)
    yield f.read(lastblock)

    for i in range(fullblocks-1, -1, -1):
        f.seek(i * blocksize)
        yield f.read(blocksize)

def tail(f, nlines):
    buf = ''
    result = []
    for block in rblocks(f):
        buf = block + buf
        lines = buf.splitlines()

        # Return all lines except the first (since may be partial)
        if lines:
            result.extend(lines[1:])  # First line may not be complete
            if (len(result) >= nlines):
                return result[-nlines:]

            buf = lines[0]

    return ([buf]+result)[-nlines:]

f = open('file_to_tail.txt', 'rb')
for line in tail(f, 20):
    print line
[Edit] Added more specific version (avoids need to reverse twice)
you can go to the end of your file with f.seek(0, 2) and then read off lines one by one with the following replacement for readline():
def readline_backwards(self, f):
    backline = ''
    last = ''
    while not last == '\n':
        backline = last + backline
        if f.tell() <= 0:
            return backline
        f.seek(-1, 1)
        last = f.read(1)
        f.seek(-1, 1)
    backline = last
    last = ''
    while not last == '\n':
        backline = last + backline
        if f.tell() <= 0:
            return backline
        f.seek(-1, 1)
        last = f.read(1)
        f.seek(-1, 1)
    f.seek(1, 1)
    return backline
Based on Eyecue's answer (Jun 10 '10 at 21:28): this class adds head() and tail() methods to the file object.
class File(file):
    def head(self, lines_2find=1):
        self.seek(0)  # Rewind file
        return [self.next() for x in xrange(lines_2find)]

    def tail(self, lines_2find=1):
        self.seek(0, 2)  # go to end of file
        bytes_in_file = self.tell()
        lines_found, total_bytes_scanned = 0, 0
        while (lines_2find+1 > lines_found and
               bytes_in_file > total_bytes_scanned):
            byte_block = min(1024, bytes_in_file-total_bytes_scanned)
            self.seek(-(byte_block+total_bytes_scanned), 2)
            total_bytes_scanned += byte_block
            lines_found += self.read(1024).count('\n')
        self.seek(-total_bytes_scanned, 2)
        line_list = list(self.readlines())
        return line_list[-lines_2find:]
Usage:
f = File('path/to/file', 'r')
f.head(3)
f.tail(3)
Several of these solutions have issues if the file doesn't end in \n, or don't ensure the complete first line is read.
def tail(file, n=1, bs=1024):
    f = open(file)
    f.seek(-1, 2)
    l = 1 - f.read(1).count('\n')  # If file doesn't end in \n, count it anyway.
    B = f.tell()
    while n >= l and B > 0:
        block = min(bs, B)
        B -= block
        f.seek(B, 0)
        l += f.read(block).count('\n')
    f.seek(B, 0)
    l = min(l, n)  # discard first (incomplete) line if l > n
    lines = f.readlines()[-l:]
    f.close()
    return lines
Here is a pretty simple implementation:
with open('/etc/passwd', 'r') as f:
    try:
        f.seek(0, 2)
        s = ''
        while s.count('\n') < 11:
            cur = f.tell()
            f.seek((cur - 10))
            s = f.read(10) + s
            f.seek((cur - 10))
        print s
    except Exception as e:
        f.readlines()
There is a very useful module that can do this:
from file_read_backwards import FileReadBackwards

with FileReadBackwards("/tmp/file", encoding="utf-8") as frb:
    # getting lines by lines starting from the last line up
    for l in frb:
        print(l)
An update to the answer given by A.Coady, which works with Python 3.
This uses exponential search and buffers only n lines from the back, so it is very efficient.
import time
import os
import sys

def tail(f, n):
    assert n >= 0
    pos, lines = n+1, []

    # set file pointer to end
    f.seek(0, os.SEEK_END)

    isFileSmall = False

    while len(lines) <= n:
        try:
            f.seek(f.tell() - pos, os.SEEK_SET)
        except ValueError as e:
            # lines greater than file seeking size
            # seek to start
            f.seek(0, os.SEEK_SET)
            isFileSmall = True
        except IOError:
            print("Some problem reading/seeking the file")
            sys.exit(-1)
        finally:
            lines = f.readlines()
            if isFileSmall:
                break

        pos *= 2

    print(lines)

    return lines[-n:]

with open("stream_logs.txt") as f:
    while(True):
        time.sleep(0.5)
        print(tail(f, 2))
I had to read a specific value from the last line of a file, and stumbled upon this thread. Rather than reinventing the wheel in Python, I ended up with a tiny shell script, saved as
/usr/local/bin/get_last_netp:
#! /bin/bash
tail -n1 /home/leif/projects/transfer/export.log | awk {'print $14'}
And in the Python program:
from subprocess import check_output
last_netp = int(check_output("/usr/local/bin/get_last_netp"))
Not the first example using a deque, but a simpler one. This one is general: it works on any iterable object, not just a file.
#!/usr/bin/env python
import sys
import collections

def tail(iterable, N):
    deq = collections.deque()
    for thing in iterable:
        if len(deq) >= N:
            deq.popleft()
        deq.append(thing)
    for thing in deq:
        yield thing

if __name__ == '__main__':
    for line in tail(sys.stdin, 10):
        sys.stdout.write(line)
This is my version of tailf
import sys, time, os

filename = 'path to file'

try:
    with open(filename) as f:
        size = os.path.getsize(filename)
        if size < 1024:
            s = size
        else:
            s = 999
        f.seek(-s, 2)
        l = f.read()
        print l
        while True:
            line = f.readline()
            if not line:
                time.sleep(1)
                continue
            print line
except IOError:
    pass
import time

attempts = 600
wait_sec = 5
fname = "YOUR_PATH"

with open(fname, "r") as f:
    where = f.tell()
    for i in range(attempts):
        line = f.readline()
        if not line:
            time.sleep(wait_sec)
            f.seek(where)
        else:
            print line,  # already has newline
import itertools

fname = 'log.txt'
offset = 5
n = 10

with open(fname) as f:
    n_last_lines = list(reversed([x for x in itertools.islice(f, None)][-(offset+1):-(offset+n+1):-1]))
abc = "2018-06-16 04:45:18.68"
filename = "abc.txt"
with open(filename) as myFile:
for num, line in enumerate(myFile, 1):
if abc in line:
lastline = num
print "last occurance of work at file is in "+str(lastline)
Another solution

If your txt file looks like this:

mouse
snake
cat
lizard
wolf
dog

you could reverse this file by simply using array indexing in Python:

contents = []

def tail(contents, n):
    with open('file.txt') as file:
        for i in file.readlines():
            contents.append(i)
    for i in contents[:n:-1]:
        print(i)

tail(contents, -5)
result:
dog
wolf
lizard
cat
Well! I had a similar problem, though I only required the last line, so I came up with my own solution:
import os

def get_last_line(filepath):
    try:
        with open(filepath, 'rb') as f:
            f.seek(-1, os.SEEK_END)
            text = [f.read(1)]
            while text[-1] != '\n'.encode('utf-8') or len(text) == 1:
                f.seek(-2, os.SEEK_CUR)
                text.append(f.read(1))
    except Exception as e:
        pass
    return ''.join([t.decode('utf-8') for t in text[::-1]]).strip()
This function returns the last line of a file.
I have a log file of 1.27 GB, and it took very little time to find the last line (not even half a second).

Hex Coded Decimal in Python

I need to make a function that accepts an integer and returns a binary string of that integer encoded as Hex Coded Decimal, for later packing into a struct.
For example, I have written this:
def convert_int(x):
    """
    Accepts an integer, outputs a hexadecimal string in HCD format
    Caution! Byte order is ALREADY little endian!
    """
    result = b''
    while x > 0:
        hcd = chr(int(str(divmod(x, 100)[1]), 16))
        result = result + hcd
        x = divmod(x, 100)[0]
    return result
so convert_int(1234) would be 3412h and so on. What is the most Pythonic and elegant way of writing this?
upd: made the function output little endian strings ready for packing.
def convert_to_hcd(num):
    chars = []
    while num:
        num, ones = divmod(num, 10)
        num, tens = divmod(num, 10)
        chars.append(chr(tens * 16 + ones))
    chars.reverse()
    return "".join(chars)

convert_to_hcd(1234)  # => returns '\x124' (which is correct because '\x34' == '4')
So, the correct code for me is the following; note that the byte order is reversed (little endian):
def convert_int(x):
    """
    Accepts an integer, outputs a hexadecimal string in HCD format
    Caution! Byte order is ALREADY little endian!
    """
    result = b''
    while x > 0:
        hcd = chr(int(str(divmod(x, 100)[1]), 16))
        result = result + hcd
        x = divmod(x, 100)[0]
    return result
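Since the stated goal is packing into a struct, here is a hedged Python 2 sketch (the 2-byte field width is my assumption) of embedding the little-endian HCD string:

import struct

hcd = convert_int(1234)           # '\x34\x12' -- little endian, as noted above
payload = struct.pack('2s', hcd)  # embed as a fixed-width 2-byte field
print payload.encode('hex')       # 3412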
What about this little snippet... :) (it expects num as a string and uses an 'm' marker to prepend a '0' when the digit count is odd):
def convert_to_hcd(num):
    return "".join(["\\x" + (lambda i, k: "" + i + k if i != 'm' else "0" + k)(i, k)
                    for i, k in zip((num if not len(num) % 2 else 'm' + num)[0::2],
                                    (num if not len(num) % 2 else 'm' + num)[1::2])])
def intToHex(num):
    numBin = [int(n) for n in str(num)]
    result = 0
    for n in numBin:
        result = result*16 + n
    return hex(result)
this is clean
>>> hex(1234)
'0x4d2'
Google isn't your enemy...

Encoding a 128-bit integer in Python?

Inspired by the "encoding scheme" of the answer to this question, I implemented my own encoding algorithm in Python.
Here is what it looks like:
import random
from math import pow
from string import ascii_letters, digits

# RFC 2396 unreserved URI characters
unreserved = '-_.!~*\'()'
characters = ascii_letters + digits + unreserved
size = len(characters)
seq = range(0, size)

# Seed random generator with same randomly generated number
random.seed(914576904)
random.shuffle(seq)

dictionary = dict(zip(seq, characters))
reverse_dictionary = dict((v, k) for k, v in dictionary.iteritems())

def encode(n):
    d = []
    n = n
    while n > 0:
        qr = divmod(n, size)
        n = qr[0]
        d.append(qr[1])
    chars = ''
    for i in d:
        chars += dictionary[i]
    return chars

def decode(str):
    d = []
    for c in str:
        d.append(reverse_dictionary[c])
    value = 0
    for i in range(0, len(d)):
        value += d[i] * pow(size, i)
    return value
The issue I'm running into is encoding and decoding very large integers. For example, this is how a large number is currently encoded and decoded:
s = encode(88291326719355847026813766449910520462)
# print s -> "3_r(AUqqMvPRkf~JXaWj8"
i = decode(s)
# print i -> "8.82913267194e+37"
# print long(i) -> "88291326719355843047833376688611262464"
The highest 16 places match up perfectly, but after those the number deviates from its original.
I assume this is a problem with the precision of extremely large integers when dividing in Python. Is there any way to circumvent this problem? Or is there another issue that I'm not aware of?
The problem lies within this line:
value += d[i] * pow(size, i)
It seems like you're using math.pow here instead of the built-in pow function. It returns a floating point number, so you lose accuracy for your large numbers. You should use the built-in pow, the ** operator, or, even better, keep the current power of the base in an integer variable:
def decode(s):
    d = [reverse_dictionary[c] for c in s]
    result, power = 0, 1
    for x in d:
        result += x * power
        power *= size
    return result
It gives me the following result now:
print decode(encode(88291326719355847026813766449910520462))
# => 88291326719355847026813766449910520462
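To see why the float path breaks: the alphabet above has 71 characters, and IEEE doubles carry only 53 bits of mantissa, so a power like 71**21 (a roughly 130-bit number) cannot be represented exactly. An illustrative check (my addition):

from math import pow
size = 71
print int(pow(size, 21))  # float result: wrong in the low-order digits
print size ** 21          # exact integer arithmetic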
