logical "OR" between bin files - python

I'm trying to write a script that takes a list of files and performs a "logical OR" between them. As you can see in the script, at the first stage I create an empty append_buffer. Then I want to OR it with all the files in the list.
My problem is that when I read the files I get a str and not a bytearray, so when I try to perform the OR it fails. I have tried to convert it without any success.
import struct
#import sys, ast

buffera = bytearray()
append_buffer = bytearray()
output_buffer = bytearray()
files_list = ['E:\out.jpg', 'E:\loala2.jpg', 'E:\Koala.jpg', 'E:\loala2.jpg']
print(files_list[1])
#######################################################################################################################
# create_dummy_bin_file_for_first_iteration , base on first file size
temp_file = open(files_list[1], "rb")
print(temp_file)
buffera = temp_file.read(temp_file.__sizeof__())
temp_file.close()
for x in range(0, len(buffera)):
    append_buffer.append(0x00)
#######################################################################################################################
for i in range(1, len(files_list)):
    print(files_list[i])
    file = open(files_list[i], "rb")
    file_buffer = file.read(file.__sizeof__())
    file.close()
    if (len(file_buffer) != len(append_buffer)):
        print("Can't merge different size bin files ")
        exit(1)
    else:
        for x in range(0, len(buffera)):
            or_data = (file_buffer[x] | append_buffer[x])
            print("---")
            print(type(file_buffer[x]))
            print(file_buffer[x])
            print("---")
            print(type(append_buffer[x]))
            print(append_buffer[x])
outputfile = open(files_list[0], "wb")
outputfile.write(output_buffer)
outputfile.close()

You can use the ord and chr functions to convert each character to an integer and back.
Using this, your code would be:
or_data=chr(ord(file_buffer[x]) | ord(append_buffer[x]))
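For example, in Python 2, where indexing a str gives a one-character string, the round trip looks like this (the byte values here are just an illustration):
>>> ord('\x0f') | ord('\xf0')
255
>>> chr(ord('\x0f') | ord('\xf0'))
'\xff'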

This code example does all the work in memory.
# Read data from the first file
with open("file1.txt", "rt") as f:
    d1 = f.read()

# Read data from the second file
with open("file2.txt", "rt") as f:
    d2 = f.read()

# Make sure that both sizes are equal
assert len(d1) == len(d2)

# Calculate OR-ed data
d3 = "".join(chr(ord(d1[i]) | ord(d2[i])) for i in range(len(d1)))

# Write the output data
with open("file3.txt", "wt") as f:
    f.write(d3)
It's possible to process this data byte by byte too, in order to reduce memory consumption.
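For instance, a chunked variant might look like the sketch below (assuming Python 3, where binary reads return bytes and iterating over them yields integers; the chunk size and file names are just placeholders):

CHUNK = 64 * 1024  # arbitrary chunk size

with open("file1.txt", "rb") as f1, open("file2.txt", "rb") as f2, \
        open("file3.txt", "wb") as out:
    while True:
        c1 = f1.read(CHUNK)
        c2 = f2.read(CHUNK)
        if not c1 and not c2:
            break
        # As above, both files are assumed to have the same size
        assert len(c1) == len(c2)
        # OR the chunks byte by byte and write the result
        out.write(bytes(a | b for a, b in zip(c1, c2)))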

Related

How to write an array to a file and then call that file and add more to the array?

So as the title suggests, I'm trying to write an array to a file, but then I need to recall that array, append more to it, write it back to the same file, and then repeat this process over and over again.
The code I have so far is:
c = open(r"board.txt", "r")
current_position = []
if filesize > 4:
current_position = [c.read()]
print(current_position)
stockfish.set_position(current_position)
else:
stockfish.set_fen_position("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1")
#There is a lot more code here that appends stuff to the array but I don't want to #add anything that will be irrelevant to the problem
with open('board.txt', 'w') as filehandle:
for listitem in current_position:
filehandle.write('"%s", ' % listitem)
z = open(r"board.txt", "r")
print(z.read())
My array ends up looking like this when I read the file:
""d2d4", "d7d5", ", "a2a4", "e2e4",
All my code is on this replit if anyone needs more info
A few ways to do this:
First, use newline as a delimiter (simple, not the most space efficient):
# write
my_array = ['d2d4', 'd7d5']
with open('board.txt', 'w+') as f:
    f.writelines([i + '\n' for i in my_array])

# read
with open('board.txt') as f:
    my_array = f.read().splitlines()
If your character strings all have the same length, you don't need a delimiter:
# write
my_array = ['d2d4', 'd7d5']  # must all be length 4 strs
with open('board.txt', 'w+') as f:
    f.writelines(my_array)

# read file, splitting string into groups of 4 characters
with open('board.txt') as f:
    in_str = f.read()
my_array = [in_str[i:i+4] for i in range(0, len(in_str), 4)]
Finally, consider pickle, which allows writing/reading Python objects to/from binary files:
import pickle

# write
my_array = ['d2d4', 'd7d5']
with open('board.board', 'wb+') as f:  # custom file extension, can be anything
    pickle.dump(my_array, f)

# read
with open('board.board', 'rb') as f:
    my_array = pickle.load(f)
As you're reusing the file to append data to it, you should replace:
open('board.txt', 'w')
with
open('board.txt', 'a')
a denotes append mode, which will not overwrite what you already have in the file; it will append to it.
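A minimal illustration of the difference, reusing the board.txt file name from above:

with open('board.txt', 'w') as f:  # 'w' truncates: previous contents are lost
    f.write('"d2d4", ')
with open('board.txt', 'a') as f:  # 'a' appends: previous contents are kept
    f.write('"d7d5", ')
with open('board.txt') as f:
    print(f.read())  # prints: "d2d4", "d7d5",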

Iterate hexadecimals to create jpeg files

I have a text file with lines of hexadecimals. I want each line to be turned into a .jpg file, because they are photos. I can do that individually using binascii.a2b_hex like this (I've shortened the hex):
data = binascii.a2b_hex("FFD8FFE")
with open('image.jpg', 'wb') as image_file:
    image_file.write(data)
Now I want to do this in bulk. So I have a text file with lines of hexadecimals and I want each hexadecimal to be written to its own jpeg file. I think I'm almost there, but my code gives me this error:
ValueError: too many values to unpack
Here is the code:
import binascii

text_file = open("photos-clean.txt", "w")

# for each hexadecimal, put it in between single quotes so it becomes a string. Also remove the first two chars from a line.
with open('photos.txt', 'r') as f:
    for i in f:
        photo = i[2:]
        quotes = "'" + photo.rstrip() + "'"
        print quotes
        text_file.write(quotes)
        text_file.write("\n")
text_file.close()

# for each hexadecimal, transform it to a jpeg with binascii and write it to its own jpeg file
with open("photos-clean.txt", "r") as f2:
    for i, data in (f2):
        transform = binascii.a2b_hex(i)
        with open('photo{}.jpg'.format(transform), 'wb') as output:
            output.write(data)
Edit: I have the answer and this is what I should've done:
import binascii

text_file = open("photos-clean.txt", "w")
with open('photos.txt', 'r') as f:
    for i in f:
        photo = i[2:]
        text_file.write(photo)
        text_file.write("\n")
text_file.close()

with open("photos-clean.txt", "r") as f2:
    count = 0
    for i in f2:
        count = count + 1
        cleaned = i.strip("\r\n")
        transform = binascii.a2b_hex(cleaned)
        with open("{}.jpg".format(count), 'wb') as output:
            output.write(transform)
I guess you are having the error at the line for i, data in (f2):.
You are trying to unpack two values, i and data, from f2; I guess you need enumerate, as below.
Also, I assume you want to use i as the index in the filename and write transform instead of data into the output:
with open("photos-clean.txt", "r") as f2:
for i, data in enumerate(f2):
transform = binascii.a2b_hex(data)
with open('photo{}.jpg'.format(i), 'wb') as output:
output.write(transfrom)

Faster way to remove duplicates from a very large text file in Python?

I have a very large text file with duplicate entries which I want to eliminate. I do not care about the order of the entries because the file will later be sorted.
Here is what I have so far:
unique_lines = set()
outfile = open("UniqueMasterList.txt", "w", encoding="latin-1")

with open("MasterList.txt", "r", encoding="latin-1") as infile:
    for line in infile:
        if line not in unique_lines:
            outfile.write(line)
            unique_lines.add(line)

outfile.close()
It has been running for 30 minutes and has not finished. I need it to be faster. What is a faster approach in Python?
Look for the corresponding system command. In Linux/UNIX, you would use
uniq MasterList.txt > UniqueMasterList.txt
The OS generally knows the best way to do these things.
post-comment edit
@Mark Ransom reminded me that uniq depends on matching lines being contiguous in the file. The simplest way to achieve this is to sort the file:
sort MasterList.txt | uniq > UniqueMasterList.txt
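If you would rather drive that pipeline from Python instead of typing it in a shell, something along these lines should work (a sketch assuming a Unix-like system with sort and uniq on the PATH):

import subprocess

with open("UniqueMasterList.txt", "w") as out:
    # shell=True keeps the pipe intact; check=True raises if the pipeline fails
    subprocess.run("sort MasterList.txt | uniq", shell=True, stdout=out, check=True)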
To use the same technique as uniq, in Python:
import itertools

with open("MasterList.txt", "r", encoding="latin-1") as infile:
    sorted_file = sorted(infile.readlines())
for line, _ in itertools.groupby(sorted_file):
    outfile.write(line)
This presumes that the entire file will fit into memory, twice. Or that the file is already sorted and you can skip that step.
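If the file is indeed already sorted, the same groupby idea can stream it line by line instead of reading it all at once (a minimal sketch):

import itertools

with open("MasterList.txt", "r", encoding="latin-1") as infile, \
        open("UniqueMasterList.txt", "w", encoding="latin-1") as outfile:
    # groupby collapses runs of identical consecutive lines
    for line, _ in itertools.groupby(infile):
        outfile.write(line)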
The simple approach I would suggest is the use of hashing and hash tables. You can hash each line with an efficient hash function, insert it into a hash table, and output the entries whose count is 1, similar to solving the word/letter count problem with hash tables. A lookup only costs O(1), and memory usage can be restricted to a constant amount depending on the size of the hash table used.
SPLIT_COUNT = 30

def write_data(t_file, value):
    t_file.write(value)

def calculate_hash(filename, handle_file):
    with open(filename, 'r') as f:
        for line in f:
            write_data(handle_file[hash(line) % SPLIT_COUNT], line)

def generate_file(dir):
    handle_file, files = [], []
    for i in range(SPLIT_COUNT):
        path = dir + "split_" + str(i)
        files.append(path)
        f = open(path, 'w')
        handle_file.append(f)
    return files, handle_file

def close_file(handle_file):
    for i in range(len(handle_file)):
        handle_file[i].close()

def data_uniq(files, new_file):
    dataset = dict()
    n_file = open(new_file, 'w')
    for filename in files:
        f = open(filename, 'r')
        for line in f:
            dataset[line] = 1
        f.close()
        for key in dataset.keys():
            n_file.write(key)
        dataset = {}
    n_file.close()

if __name__ == "__main__":
    filename = './clean.txt'
    generate_dir = './tmp/'
    new_file = './out.txt'

    files, handle_file = generate_file(generate_dir)
    calculate_hash(filename, handle_file)
    close_file(handle_file)
    data_uniq(files, new_file)

xor-ing a large file in python

I am trying to apply an XOR operation to a number of files, some of which are very large.
Basically I am getting a file and XOR-ing it byte by byte (or at least this is what I think I'm doing). When it hits a larger file (around 70 MB) I get an out of memory error and my script crashes.
My computer has 16 GB of RAM with more than 50% of it available, so I would not relate this to my hardware.
def xor3(source_file, target_file):
    b = bytearray(open(source_file, 'rb').read())
    for i in range(len(b)):
        b[i] ^= 0x71
    open(target_file, 'wb').write(b)
I tried to read the file in chunks, but it seems I'm too inexperienced for this, as the output is not the desired one. The first function returns what I want, of course :)
def xor(data):
    b = bytearray(data)
    for i in range(len(b)):
        b[i] ^= 0x41
    return data

def xor4(source_file, target_file):
    with open(source_file, 'rb') as ifile:
        with open(target_file, 'w+b') as ofile:
            data = ifile.read(1024*1024)
            while data:
                ofile.write(xor(data))
                data = ifile.read(1024*1024)
What is the appropriate solution for this kind of operation? What is it that I am doing wrong?
Use the seek function to read the file in chunks and append each processed chunk to the output file:
CHUNK_SIZE = 1000  # for example

with open(source_file, 'rb') as source:
    with open(target_file, 'ab') as target:  # append in binary mode
        pos = 0
        while True:
            source.seek(pos)
            chunk = bytearray(source.read(CHUNK_SIZE))
            if not chunk:
                break
            for i in range(len(chunk)):
                chunk[i] ^= 0x71
            target.write(chunk)
            pos += CHUNK_SIZE
Unless I am mistaken, in your second example, you create a copy of data by calling bytearray and assigning it to b. Then you modify b, but return data.
The modification in b has no effect on data itself.
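A minimal fix along those lines, keeping the 0x41 constant from the question as a default parameter, is to return the modified buffer:

def xor(data, key=0x41):
    b = bytearray(data)
    for i in range(len(b)):
        b[i] ^= key
    return bytes(b)  # return the modified copy, not the untouched input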
Iterate lazily over the large file.
from operator import xor
from functools import partial

def chunked(file, chunk_size):
    return iter(lambda: file.read(chunk_size), b'')

myoperation = partial(xor, 0x71)

with open(source_file, 'rb') as source, open(target_file, 'ab') as target:
    processed = (map(myoperation, bytearray(data)) for data in chunked(source, 65536))
    for data in processed:
        target.write(bytearray(data))
This probably only works in Python 2, which shows again that Python 3 is much nicer to use for byte streams:
def xor(infile, outfile, val=0x71, chunk=1024):
    with open(infile, 'r') as inf:
        with open(outfile, 'w') as outf:
            c = inf.read(chunk)
            while c != '':
                s = "".join([chr(ord(cc) ^ val) for cc in c])
                outf.write(s)
                c = inf.read(chunk)

Python- Read from Multiple Files

I have 125 data files containing two columns and 21 rows of data. Please see the image below:
and I'd like to import them into a single .csv file (as 250 columns and 21 rows).
I am fairly new to Python, but this is what I have been advised, code-wise:
import glob

Results = [open(f) for f in glob.glob("*.data")]

fout = open("res.csv", 'w')
for row in range(21):
    for f in Results:
        fout.write(f.readline().strip())
        fout.write(',')
    fout.write('\n')
fout.close()
However, there is a slight problem with the code, as I only get 125 columns (i.e. the force and displacement columns are written into one column). Please refer to the image below:
I'd very much appreciate it if anyone could help me with this!
import glob

results = [open(f) for f in glob.glob("*.data")]

sep = ","
# Uncomment if your Excel formats decimal numbers like 3,14 instead of 3.14
# sep = ";"

with open("res.csv", 'w') as fout:
    for row in range(21):
        iterator = (f.readline().strip().replace("\t", sep) for f in results)
        line = sep.join(iterator)
        fout.write("{0}\n".format(line))
So, to explain what went wrong with your code: your source files use tab as the field separator, but your code uses a comma to separate the lines it reads from those files. If your Excel uses a period as the decimal separator, it uses a comma as the default field separator. The whitespace is ignored unless enclosed in quotes, and that is the result you see.
If you use the text import feature of Excel (Data ribbon => From Text) you can ask it to consider both comma and tab as valid field separators, and then I'm pretty sure your original output would work too.
In contrast, the above code should produce a file that will open correctly when double clicked.
You don't need to write your own program to do this, in python or otherwise. You can use an existing unix command (if you are in that environment):
paste *.data > res.csv
Try this:
import glob, csv
from itertools import cycle, islice, count

def roundrobin(*iterables):
    "roundrobin('ABC', 'D', 'EF') --> A D E B F C"
    # Recipe credited to George Sakkis
    pending = len(iterables)
    nexts = cycle(iter(it).next for it in iterables)
    while pending:
        try:
            for next in nexts:
                yield next()
        except StopIteration:
            pending -= 1
            nexts = cycle(islice(nexts, pending))

Results = [open(f).readlines() for f in glob.glob("*.data")]
fout = csv.writer(open("res.csv", 'wb'), dialect="excel")

row = []
for line, c in zip(roundrobin(Results), cycle(range(len(Results)))):
    splitline = line.split()
    for item, currItem in zip(splitline, count(1)):
        row[c+currItem] = item
    if count == len(Results):
        fout.writerow(row)
        row = []
del fout
It should loop over each line of your input file and stitch them together as one row, which the csv library will write in the listed dialect.
I suggest getting used to the csv module. The reason is that if the data is not that simple (simple strings in the headings, and then numbers only), it is difficult to implement everything yourself again. Try the following:
import csv
import glob
import os

datapath = './data'
resultpath = './result'

if not os.path.isdir(resultpath):
    os.makedirs(resultpath)

# Initialize the empty rows. It does not check how many rows are
# in the file.
rows = []

# Read data from the files to the above matrix.
for fname in glob.glob(os.path.join(datapath, '*.data')):
    with open(fname, 'rb') as f:
        reader = csv.reader(f)
        for n, row in enumerate(reader):
            if len(rows) < n+1:
                rows.append([])      # add another row
            rows[n].extend(row)      # append the elements from the file

# Write the data from memory to the result file.
fname = os.path.join(resultpath, 'result.csv')
with open(fname, 'wb') as f:
    writer = csv.writer(f)
    for row in rows:
        writer.writerow(row)
The with construct for opening a file can be replaced by the pair:
f = open(fname, 'wb')
...
f.close()
csv.reader and csv.writer are simply wrappers that parse or compose the lines of the file. The docs say they require the file to be opened in binary mode.
