I am trying to read a huge binary file. I am reading it in chunks, breaking the stream up like this:
with open("testUniform.bin", 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
dostuff(chunk)
All I am getting in chunk is a stream like this (I have attached a screenshot of it):
.y.$.!...
I can't work out how to convert it back into a binary stream, and I still don't understand why the stream is being displayed like this.
Please help.
I am trying to convert the binary stream to decimal values, but since it is a huge file I cannot simply call
f.read()
on the whole thing. Here is my code:
from math import log, ceil, pow
from flask import Flask, request
from flask_restful import Resource, Api
import struct

app = Flask(__name__)
api = Api(app)

def binaryToDecimal(n):
    return int(n, 2)

def dostuff(inputarray):
    args = request.args
    lower_end_range = args['lower_end_range']
    higher_end_range = args['higher_end_range']
    amount = args['amount']
    lower_end_range = int(lower_end_range)
    higher_end_range = int(higher_end_range)
    amount = int(amount)
    # range_input is the range
    range_input = higher_end_range - lower_end_range + 1
    # taking the log of the range to generate offset
    log_of_range = log(range_input, 2)
    log_of_range = int(ceil(log_of_range))
    higher_end_range_represented_by_bits = 0
    lower_end_range_represented_by_bits = 0
    lst = []
    FinalRandomArray = []
    # creating the maximum of numbers which it can go to by saving, for ex: 2^3+2^2+2^1+2^0
    for i in range(0, log_of_range):
        higher_end_range_represented_by_bits += pow(2, i)
    while True:
        i = range_input % 2
        range_input = range_input / 2
        lst.append(i)
        if range_input == 0:
            break
    length = len(lst)
    # where length is equal to the window size
    for file in range(0, len(inputarray), length):
        print(inputarray[0])
        number = binaryToDecimal((inputarray[file] + inputarray[file+1] + inputarray[file+2])) + lower_end_range
        if (number >= lower_end_range and number <= higher_end_range):
            if (amount != 0):
                FinalRandomArray.append(number)
                amount -= 1
    return {'finalrandomarray': FinalRandomArray}

class ReturnMainModule(Resource):
    def get(self):
        with open("testUniform.bin", 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                dostuff(chunk)

api.add_resource(ReturnMainModule, '/main')

# Driver code
if __name__ == '__main__':
    app.run(port='5004')
I am trying to use an MPU-6000 accelerometer and a Raspberry Pi Zero W to log vibration data in a windshield. I'm fairly new to Python, so please bear with me.
I've written a Python 2 script that configures the MPU-6000 to communicate over I2C, with the clock configured to 400 kHz.
The MPU-6000 raises an interrupt when new data is available in the accelerometer registers; the data is read, converted to two's complement and then written to a CSV file together with a timestamp. The output rate of the accelerometer is configured to be 1 kHz.
I'm finding that when sampling all three sensor axes, the script isn't able to write all data points to the CSV file. Instead of 1000 data points per axis per second I get approximately 650 data points per axis per second.
I've tried writing only one axis, which was successful at 1000 data points per second. I know that the MPU-6000 has a FIFO register available, which I could probably burst-read to get 1000 samples/s without any problem. The problem would be obtaining a timestamp for each sample, so I haven't tried to implement reading from the FIFO register yet.
I will most likely do most of the post-processing in Matlab, so the most important thing the Python script should do is write the sensor data, in any form, to a CSV file at the determined rate, with a timestamp.
Is there any way to further improve my Python script, so I can sample all three axes and write to a CSV file at a 1 kHz rate?
Parts of my script are shown below:
#!/usr/bin/python
import smbus
import math
import csv
import time
import sys
import datetime

# Register addresses
power_mgmt_1 = 0x6b
power_mgmt_2 = 0x6c
samlerate_divider = 0x19
accel_config = 0x1C
INT_Enable = 0x38

def read_byte(reg):
    return bus.read_byte_data(address, reg)

def read_word(reg):
    h = bus.read_byte_data(address, reg)
    l = bus.read_byte_data(address, reg + 1)
    value = (h << 8) + l
    return value

def read_word_2c(reg):
    val = read_word(reg)
    if (val >= 0x8000):
        return -((65535 - val) + 1)
    else:
        return val

csvwriter = None

def csv_open():
    global csvwriter  # so csv_write() sees the writer created here
    csvfile = open('accel-data.csv', 'a')
    csvwriter = csv.writer(csvfile)

def csv_write(timedelta, accelerometerx, accelerometery, accelerometerz):
    global csvwriter
    csvwriter.writerow([timedelta, accelerometerx, accelerometery,
                        accelerometerz])

# I2C configs
bus = smbus.SMBus(1)
address = 0x69

# Power management configurations
bus.write_byte_data(address, power_mgmt_1, 0)
bus.write_byte_data(address, power_mgmt_2, 0x00)

# Configure sample-rate divider
bus.write_byte_data(address, 0x19, 0x07)

# Configure data ready interrupt:
bus.write_byte_data(address, INT_Enable, 0x01)

# Opening csv file and getting ready for writing
csv_open()
csv_write('Time', 'X_Axis', 'Y_Axis', 'Z_Axis')

print
print "Accelerometer"
print "---------------------"
print "Printing accelerometer data: "

#starttime = datetime.datetime.now()
while True:
    data_interrupt_read = bus.read_byte_data(address, 0x3A)
    if data_interrupt_read == 1:
        meas_time = datetime.datetime.now()
        # delta_time = meas_time - starttime
        accelerometer_xout = read_word_2c(0x3b)
        accelerometer_yout = read_word_2c(0x3d)
        accelerometer_zout = read_word_2c(0x3f)
        # accelerometer_xout = read_word(0x3b)
        # accelerometer_yout = read_word(0x3d)
        # accelerometer_zout = read_word(0x3f)
        # accelerometer_xout_scaled = accelerometer_xout / 16384.0
        # accelerometer_yout_scaled = accelerometer_yout / 16384.0
        # accelerometer_zout_scaled = accelerometer_zout / 16384.0
        # csv_write(meas_time, accelerometer_xout_scaled,
        #           accelerometer_yout_scaled, accelerometer_zout_scaled)
        csv_write(meas_time, accelerometer_xout, accelerometer_yout,
                  accelerometer_zout)
        continue
If the data you are trying to write is continuous, then the best approach is to minimise the amount of processing needed to write it and to also minimise the amount of data being written. To do this, a good approach would be to write the raw data into a binary formatted file. Each data word would then only require 2 bytes to be written. The datetime object can be converted into a timestamp which would need 4 bytes. So you would use a format such as:
[4 byte timestamp][2 byte x][2 byte y][2 byte z]
Python's struct library can be used to convert multiple variables into a single binary string which can be written to a file. The data appears to be signed; if that is the case, you could write each word as-is and then use the library's built-in support for signed values when reading it back in later.
For example, the following could be used to write the raw data to a binary file:
#!/usr/bin/python
import smbus
import math
import csv
import time
import sys
import datetime
import struct

# Register addresses
power_mgmt_1 = 0x6b
power_mgmt_2 = 0x6c
samlerate_divider = 0x19
accel_config = 0x1C
INT_Enable = 0x38

def read_byte(reg):
    return bus.read_byte_data(address, reg)

def read_word(reg):
    h = bus.read_byte_data(address, reg)
    l = bus.read_byte_data(address, reg + 1)
    value = (h << 8) + l
    return value

# I2C configs
bus = smbus.SMBus(1)
address = 0x69

# Power management configurations
bus.write_byte_data(address, power_mgmt_1, 0)
bus.write_byte_data(address, power_mgmt_2, 0x00)

# Configure sample-rate divider
bus.write_byte_data(address, 0x19, 0x07)

# Configure data ready interrupt:
bus.write_byte_data(address, INT_Enable, 0x01)

print
print "Accelerometer"
print "---------------------"
print "Printing accelerometer data: "

#starttime = datetime.datetime.now()
bin_format = 'L3H'

with open('accel-data.bin', 'ab') as f_output:
    while True:
        #data_interrupt_read = bus.read_byte_data(address, 0x3A)
        data_interrupt_read = 1
        if data_interrupt_read == 1:
            meas_time = datetime.datetime.now()
            timestamp = time.mktime(meas_time.timetuple())
            accelerometer_xout = read_word(0x3b)
            accelerometer_yout = read_word(0x3d)
            accelerometer_zout = read_word(0x3f)
            f_output.write(struct.pack(bin_format, timestamp, accelerometer_xout, accelerometer_yout, accelerometer_zout))
Later on, you could convert the binary file to a CSV file using:
from datetime import datetime
import csv
import struct

bin_format = 'L3h'    # Read data as signed words
entry_size = struct.calcsize(bin_format)

with open('accel-data.bin', 'rb') as f_input, open('accel-data.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Time', 'X_Axis', 'Y_Axis', 'Z_Axis'])

    while True:
        bin_entry = f_input.read(entry_size)
        if len(bin_entry) < entry_size:
            break
        entry = list(struct.unpack(bin_format, bin_entry))
        entry[0] = datetime.fromtimestamp(entry[0]).strftime('%Y-%m-%d %H:%M:%S')
        csv_output.writerow(entry)
If your data collection is not continuous, you could make use of threads: one thread would read your data into a queue, and another thread would read items out of the queue and write them to disk (see the sketch below).
If the collection is continuous, this approach will fail if writing the data is slower than reading it.
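A minimal producer/consumer sketch of that idea (the names sample_queue and writer_thread are hypothetical, not part of the original script):

import threading
import Queue  # 'queue' on Python 3

sample_queue = Queue.Queue()

def writer_thread():
    # Drain packed samples from the queue and append them to the binary file.
    with open('accel-data.bin', 'ab') as f_output:
        while True:
            item = sample_queue.get()
            if item is None:  # sentinel meaning "no more data"
                break
            f_output.write(item)
            sample_queue.task_done()

t = threading.Thread(target=writer_thread)
t.start()

# In the sampling loop, hand each packed entry to the queue instead of
# writing it directly:
#     sample_queue.put(struct.pack(bin_format, timestamp, x, y, z))
# and when sampling is finished:
#     sample_queue.put(None)
#     t.join()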
Take a look at the format characters used to tell struct how to pack and unpack the binary data. For example:
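The snippet below (a sketch of mine, not from the original answer) does a quick round trip with an explicit little-endian version of the format used above; without the '<' prefix the field sizes are platform dependent:

import struct

bin_format = '<L3h'  # 4-byte unsigned long followed by three signed 16-bit words
packed = struct.pack(bin_format, 1634567890, -123, 456, -789)

print(struct.calcsize(bin_format))        # 10 bytes per entry
print(struct.unpack(bin_format, packed))  # (1634567890, -123, 456, -789)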
I'm running a Python script on a Sun Grid Engine supercompute cluster that reads in a list of file ids, sends each to a worker process for analysis, and writes one output per input file to disk.
The trouble is I'm getting IOError(110, 'Connection timed out') somewhere inside the worker function, and I'm not sure why. I've received this error in the past when making network requests that were severely delayed, but in this case the worker is only trying to read data from disk.
My question is: What would cause a Connection timed out error when reading from disk, and how can one resolve this error? Any help others can offer would be very appreciated.
Full script (the IOError crops up in minhash_text()):
from datasketch import MinHash
from multiprocessing import Pool
from collections import defaultdict
from nltk import ngrams
import json
import sys
import codecs
import config

cores = 24
window_len = 12
step = 4
worker_files = 50
permutations = 256
hashband_len = 4

def minhash_text(args):
    '''Return a list of hashband strings for an input doc'''
    try:
        file_id, path = args
        with codecs.open(path, 'r', 'utf8') as f:
            f = f.read()
        all_hashbands = []
        for window_idx, window in enumerate(ngrams(f.split(), window_len)):
            window_hashbands = []
            if window_idx % step != 0:
                continue
            minhash = MinHash(num_perm=permutations, seed=1)
            for ngram in set(ngrams(' '.join(window), 3)):
                minhash.update(''.join(ngram).encode('utf8'))
            hashband_vals = []
            for i in minhash.hashvalues:
                hashband_vals.append(i)
                if len(hashband_vals) == hashband_len:
                    window_hashbands.append('.'.join([str(j) for j in hashband_vals]))
                    hashband_vals = []
            all_hashbands.append(window_hashbands)
        return {'file_id': file_id, 'hashbands': all_hashbands}
    except Exception as exc:
        print(' ! error occurred while processing', file_id, exc)
        return {'file_id': file_id, 'hashbands': []}

if __name__ == '__main__':
    file_ids = json.load(open('file_ids.json'))
    file_id_path_tuples = [(file_id, path) for file_id, path in file_ids.items()]
    worker_id = int(sys.argv[1])
    worker_ids = list(ngrams(file_id_path_tuples, worker_files))[worker_id]
    hashband_to_ids = defaultdict(list)
    pool = Pool(cores)
    for idx, result in enumerate(pool.imap(minhash_text, worker_ids)):
        print(' * processed', idx, 'results')
        file_id = result['file_id']
        hashbands = result['hashbands']
        for window_idx, window_hashbands in enumerate(hashbands):
            for hashband in window_hashbands:
                hashband_to_ids[hashband].append(file_id + '.' + str(window_idx))
    with open(config.out_dir + 'minhashes-' + str(worker_id) + '.json', 'w') as out:
        json.dump(dict(hashband_to_ids), out)
It turned out I was hammering the filesystem too hard, making too many concurrent read requests for files on the same server. That server could only allow a fixed number of reads in a given period, so any requests over that limit received a Connection Timed Out response.
The solution was to wrap each file read request in a while loop. Inside that while loop, try to read the appropriate file from disk. If the Connection timed out error is raised, sleep for a second and try again. Only once the file has been read successfully may the while loop be broken. For example:
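A minimal sketch of that retry loop (the helper name read_with_retry is mine, not from the original script):

import codecs
import time

def read_with_retry(path, retry_delay=1):
    '''Keep retrying until the file can actually be read from disk.'''
    while True:
        try:
            with codecs.open(path, 'r', 'utf8') as f:
                return f.read()
        except IOError:
            # the fileserver refused the read; back off briefly and retry
            time.sleep(retry_delay)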
First of all, I have to say that I am a beginner in Python programming. I have connected a sensor to the RPi 3 UART port. I found a working program for this sensor on the Internet. With a few modifications of my own, it writes the measurement result to a file. The code is below:
import serial
import time
import sys
import json
import datetime
import binascii

class pmsA003():
    def __init__(self, dev):
        self.serial = serial.Serial(dev, baudrate=9600,
                                    timeout=3)

    def __exit__(self, exc_type, exc_value, traceback):
        self.serial.close()

    def setIdel(self):
        idelcmd = b'\x42\x4d\xe4\x00\x00\x01\x73'
        ary = bytearray(idelcmd)
        self.serial.write(ary)

    def setNormal(self):
        normalcmd = b'\x42\x4d\xe4\x00\x01\x01\x74'
        ary = bytearray(normalcmd)
        self.serial.write(ary)

    def vertify_data(self):
        if not self.data:
            return False
        return True

    def read_data(self):
        while True:
            b = self.serial.read(1)
            if b == b'\x42':
                data = self.serial.read(31)
                if data[0] == b'\x4d':
                    self.data = bytearray(b'\x42' + data)
                    if self.vertify_data():
                        return self._PMdata()

    def _PMdata(self):
        d = {}
        d['apm10'] = self.data[4] * 256 + self.data[5]
        d['apm25'] = self.data[6] * 256 + self.data[7]
        d['apm100'] = self.data[8] * 256 + self.data[9]
        return d

if __name__ == '__main__':
    con = pmsA003('/dev/ttyAMA0')
    d = con.read_data()
    print(d)
    with open('/home/pi/ramdisk/PMA003', 'a') as f:
        f.write("%s" % (d))
As a result, I get a string like this on the console and in the file:
{'apm10': 150, 'apm100': 244, 'apm25': 228}
But my goal is to have a CSV-like file in the form:
,value_of_apm10, value_of_apm25, value_of_apm100
Can anybody help me to modify the code above?
You need to import the Python CSV library and then create a csv.DictWriter object. (See the Python documentation on how to do this - it's fairly straightforward).
When you create the DictWriter you give it a list of the column headings you require. Then, for each record you want to write to the CSV file, you simply create a dictionary with keys corresponding to the column headings and call the DictWriter's writerow() method.
There's an example of exactly what you need to do here.
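For instance, a minimal sketch of how the main block above could be adapted (the time column, the .csv filename and the os.path.exists() check are my additions, not part of the original program):

import csv
import datetime
import os

if __name__ == '__main__':
    con = pmsA003('/dev/ttyAMA0')
    d = con.read_data()
    print(d)
    outfile = '/home/pi/ramdisk/PMA003.csv'
    write_header = not os.path.exists(outfile)  # header only for a brand-new file
    with open(outfile, 'a') as f:
        writer = csv.DictWriter(f, fieldnames=['time', 'apm10', 'apm25', 'apm100'])
        if write_header:
            writer.writeheader()
        d['time'] = datetime.datetime.now().isoformat()
        writer.writerow(d)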
I am able to generate and stream text on the fly, but unable to generate and stream a compressed file on the fly.
from flask import Flask, request, Response, stream_with_context
import zlib
import gzip

app = Flask(__name__)

def generate_text():
    for x in range(10000):
        yield f"this is my line: {x}\n".encode()

@app.route('/stream_text')
def stream_text():
    response = Response(stream_with_context(generate_text()))
    return response

def generate_zip():
    for x in range(10000):
        yield zlib.compress(f"this is my line: {x}\n".encode())

@app.route('/stream_zip')
def stream_zip():
    response = Response(stream_with_context(generate_zip()), mimetype='application/zip')
    response.headers['Content-Disposition'] = 'attachment; filename=data.gz'
    return response

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000, debug=True)
Then, using curl and gunzip:
curl http://127.0.0.1:8000/stream_zip > data.gz
gunzip data.gz
gunzip: data.gz: not in gzip format
I don't care if it is zip, gzip, or any other type of compression.
generate_text in my real code generates over 4 GB of data so I would like to compress on the fly.
Saving the text to a file, zipping it, returning the zip file, and then deleting it is not the solution I'm after.
I need to be in a loop: generate some text -> compress that text -> stream the compressed data, until I'm done.
zip/gzip ... anything is fine as long as it works.
You are yielding a series of compressed documents, not a single compressed stream. Don't use zlib.compress(), it includes the header and forms a single document.
You need to create a zlib.compressobj() object instead, and use the Compress.compress() method on that object to produce a stream of data (followed by a final call to Compress.flush()):
def generate_zip():
    compressor = zlib.compressobj()
    for x in range(10000):
        chunk = compressor.compress(f"this is my line: {x}\n".encode())
        if chunk:
            yield chunk
    yield compressor.flush()
The compressor can produce empty blocks when there is not enough data yet to produce a full compressed-data chunk, the above only yields if there is actually anything to send. Because your input data is so highly repetitive and thus the data can be efficiently compressed, this yields only 3 times (once with 2-byte header, once with about 21kb of compressed data covering the first 8288 iterations over range(), and finally with the remaining 4kb for the rest of the loop).
In aggregate, this produces the same data as a single zlib.compress() call with all inputs concatenated. The correct mime-type for this data format is application/zlib, not application/zip.
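As a quick sanity check (a sketch of my own, not part of the original answer), joining the streamed chunks and decompressing them yields the original text:

import zlib

def generate_text():
    for x in range(10000):
        yield f"this is my line: {x}\n".encode()

# generate_zip() is the compressobj-based generator defined above.
streamed = b''.join(generate_zip())
assert zlib.decompress(streamed) == b''.join(generate_text())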
This format is not readily decompressible with gzip however, not without some trickery. That's because the above doesn't yet produce a GZIP file, it just produces a raw zlib-compressed stream. To make it GZIP compatible, you need to configure the compression correctly, send a header first, and add a CRC checksum and data length value at the end:
import zlib
import struct
import time

def generate_gzip():
    # Yield a gzip file header first.
    yield bytes([
        0x1F, 0x8B, 0x08, 0x00,  # Gzip file, deflate, no filename
        *struct.pack('<L', int(time.time())),  # compression start time
        0x02, 0xFF,  # maximum compression, no OS specified
    ])

    # bookkeeping: the compression state, running CRC and total length
    compressor = zlib.compressobj(
        9, zlib.DEFLATED, -zlib.MAX_WBITS, zlib.DEF_MEM_LEVEL, 0)
    crc = zlib.crc32(b"")
    length = 0

    for x in range(10000):
        data = f"this is my line: {x}\n".encode()
        chunk = compressor.compress(data)
        if chunk:
            yield chunk
        crc = zlib.crc32(data, crc) & 0xFFFFFFFF
        length += len(data)

    # Finishing off, send remainder of the compressed data, and CRC and length
    yield compressor.flush()
    yield struct.pack("<2L", crc, length & 0xFFFFFFFF)
Serve this as application/gzip:
#app.route('/stream_gzip')
def stream_gzip():
response = Response(stream_with_context(generate_gzip()), mimetype='application/gzip')
response.headers['Content-Disposition'] = 'attachment; filename=data.gz'
return response
and the result can be decompressed on the fly:
curl http://127.0.0.1:8000/stream_gzip | gunzip -c | less
While I was extremely impressed by Martijn's solution, I decided to roll my own that uses pigz for better performance:
def yield_pigz(results, compresslevel=1):
    cmd = ['pigz', '-%d' % compresslevel]
    pigz_proc = subprocess.Popen(cmd, bufsize=0,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE)

    def f():
        for result in results:
            pigz_proc.stdin.write(result)
            pigz_proc.stdin.flush()
        pigz_proc.stdin.close()

    try:
        t = threading.Thread(target=f)
        t.start()
        while True:
            buf = pigz_proc.stdout.read(4096)
            if len(buf) == 0:
                break
            yield buf
    finally:
        t.join()
        pigz_proc.wait()
Keep in mind that you'll need to import subprocess and threading for this to work. You will also need to install the pigz program (it is already in the repositories of most Linux distributions; on Ubuntu, just use sudo apt install pigz -y).
Example usage:
from flask import Flask, Response
import subprocess
import threading
import random

app = Flask(__name__)

def yield_something_random():
    for i in range(10000):
        seq = [chr(random.randint(ord('A'), ord('Z'))) for c in range(1000)]
        yield ''.join(seq).encode()  # pigz's stdin expects bytes

@app.route('/')
def index():
    return Response(yield_pigz(yield_something_random()))
I think that currently you're just sending the generator object instead of the data!
You may want to do something like this (I haven't tested it, so it may need some changes):
def generate_zip():
    import io
    with gzip.GzipFile(fileobj=io.BytesIO(), mode='w') as gfile:
        for x in xrange(10000):
            gfile.write("this is my line: {}\n".format(x))
        return gfile.read()
Working generate_zip() with low memory consumption :) :
def generate_zip():
    buff = io.BytesIO()
    gz = gzip.GzipFile(mode='w', fileobj=buff)
    for x in xrange(10000):
        gz.write("this is my line: {}\n".format(x))
        # hand over whatever compressed bytes the GzipFile has produced so
        # far, then empty the buffer so it never grows large
        chunk = buff.getvalue()
        if chunk:
            yield chunk
            buff.seek(0)
            buff.truncate()
    gz.close()
    # remaining compressed data plus the gzip trailer
    yield buff.getvalue()
Consider the following program:
#!/usr/bin/env pypy

import json
import cStringIO
import sys

def main():
    BUFSIZE = 10240
    f = sys.stdin
    decoder = json.JSONDecoder()
    io = cStringIO.StringIO()
    do_continue = True
    while True:
        read = f.read(BUFSIZE)
        if len(read) < BUFSIZE:
            do_continue = False
        io.write(read)
        try:
            data, offset = decoder.raw_decode(io.getvalue())
            print(data)
            rest = io.getvalue()[offset:]
            if rest.startswith('\n'):
                rest = rest[1:]
            decoder = json.JSONDecoder()
            io = cStringIO.StringIO()
            io.write(rest)
        except ValueError, e:
            #print(e)
            #print(repr(io.getvalue()))
            continue
        if not do_continue:
            break

if __name__ == '__main__':
    main()
And here's a test case:
$ yes '{}' | pv | pypy parser-test.py >/dev/null
As you can see, the script slows down as more input is fed to it. This also happens with CPython. I tried to profile the script using mprof and cProfile, but I found no hint as to why. Does anybody have a clue?
Apparently the string operations slowed it down. Instead of:
data, offset = decoder.raw_decode(io.getvalue())
print(data)
rest = io.getvalue()[offset:]
if rest.startswith('\n'):
    rest = rest[1:]
It is better to do:
data, offset = decoder.raw_decode(io.read())
print(data)
rest = io.getvalue()[offset:]
io.truncate()
io.write(rest)
if rest.startswith('\n'):
    io.seek(1)
You may want to close your StringIO at the end of the iteration (after writing).
io.close()
The memory buffer for a StringIO is freed once it is closed, but stays allocated otherwise. This would explain why each additional input slows your script down.
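For what it's worth, here is a self-contained sketch of mine (Python 3, plain string buffer instead of cStringIO, assuming newline-separated JSON values as in the yes '{}' test) that drains every complete document from the buffer before reading more input, which keeps the buffer, and therefore the per-iteration string work, small:

import json
import sys

def main():
    BUFSIZE = 10240
    decoder = json.JSONDecoder()
    buf = ''  # unparsed tail of the input
    while True:
        read = sys.stdin.read(BUFSIZE)
        buf += read
        while buf:
            try:
                data, offset = decoder.raw_decode(buf)
            except ValueError:
                break  # incomplete document; need more input
            print(data)
            buf = buf[offset:].lstrip('\n')  # keep only the unparsed tail
        if len(read) < BUFSIZE:  # EOF
            break

if __name__ == '__main__':
    main()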