Too many values to unpack reading a GBF file - python

Hello, I get the Python error "too many values to unpack" when I run my script.
Traceback:
File "C:\Python27.1\perpetuum.py", line 193, in __init__
header_offset, header_length = struct.unpack('8sii', f.read(16))
ValueError: too many values to unpack
Code:
class DataFile(object):
    """GBF file reader"""

    def __init__(self, filename):
        self.filename = filename
        with open(filename, 'rb') as f:
            header_offset, header_length = struct.unpack('8sii', f.read(16))
            if magic != 'GXYDATA\x1a':
                raise Exception('Invalid data file (wrong magic)', magic)
        header = self._get_record(header_offset, header_length)
        self._records = self._parse_header(header)

    def _decode(self, data):
        try:
            import numpy as np
            i = np.arange(len(data), dtype=np.byte)
            buf = np.frombuffer(data, np.byte) ^ ((i + 1) * (i ^ -54) - 84)
            return buf.tostring()
        except ImportError:
            buf = array.array('B', data)
            for i in xrange(len(data)):
                buf[i] = 0xff & (buf[i] ^ ((i + 1) * (i ^ 0xca) - 84))
            return buf.tostring()

    def _get_record(self, offset, length):
        with open(self.filename, 'rb') as f:
            f.seek(offset)
            data = f.read(length)
        return self._decode(data)

    def _parse_header(self, header):
        """
        header record format:
            int numRecords
            for each record:
                char[nameLen] nameSZ, 0
                int offset
                int length
                int unkLength
        """
        records = {}
        num_records = struct.unpack_from('i', header)[0]
        pos = 4
        for i in xrange(num_records):
            name_end = header.find('\0', pos)
            name = header[pos:name_end]
            pos = name_end + 1
            offset, length, unkLength = struct.unpack_from('iii', header, pos)
            pos += 12
            # f1Length = min(13, unkLength)
            # f1 = header[pos:pos+f1Length]
            pos += unkLength
            records[name] = (offset, length)
        return records

    PREFIX_MAP = {'\x89PNG': '.png',
                  'DDS ': '.dds',
                  'A3MF': '.a3m',
                  '#': '.txt',
                  '=': '.txt',
                  'Extended Module': '.xm',
                  'RIFF': '.wav',
                  'OggS': '.ogg'}

    def _guess_ext(self, name, data):
        for prefix, ext in self.PREFIX_MAP.iteritems():
            if data.startswith(prefix):
                return ext
        return '.bin'

    CATEGORY_MAP = OrderedDict([
        ('def*.png', 'icons'),
        ('icon*.png', 'icons'),
        ('entityIcon*.png', 'icons'),
        ('noIcon*.png', 'icons'),
        ('gfx_*.png', 'gfx'),
        ('*.a3m', 'models'),
        ('snd_*', 'sound'),
        ('altitude*', 'terrain'),
        ('terrain*', 'terrain'),
        ('altitude0*', 'terrain'),
        ('blocks0*', 'terrain'),
        ('control0*', 'terrain'),
        ('plants0*', 'terrain'),
        ('surface0*', 'terrain'),
        ('tactical*.png', 'tactical_icons'),
        ('font*', 'font'),
        ('textures_*.dds', 'textures'),
        ('corp*.png', 'corp_icons'),
        ('credits.txt', 'misc'),
        ('eula*.txt', 'misc'),
        ('*.txt', 'text_data')])

    def dump_record(self, name, dest_dir, sort=False):
        offset, length = self._records[name]
        print '%08x: %s (%.2f KB)' % (offset, name, length / 1024.)
        data = self._get_record(offset, length)
        name += self._guess_ext(name, data)
        if sort:
            for pattern, category in self.CATEGORY_MAP.iteritems():
                if fnmatch.fnmatch(name, pattern):
                    dest_dir = os.path.join(dest_dir, category)
                    try:
                        os.makedirs(dest_dir)
                    except OSError:
                        pass
                    break
        rec_filename = os.path.join(dest_dir, name)
        with open(rec_filename, 'wb') as f:
            f.write(data)

    def dump_records(self, patterns, dest_dir, sort=False):
        for name in self._records:
            if any(fnmatch.fnmatch(name, pattern) for pattern in patterns):
                self.dump_record(name, dest_dir, sort)
Any ideas?

The code
struct.unpack('8sii', f.read(16))
unpacks into three values - an 8-byte string and two 4-byte integers - while your left-hand side has only two variables.
See the struct documentation for its format string.
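For example, keeping the format string as it is, the result can be unpacked into three names (the first presumably being the magic string that your later check uses; filename stands for the GBF path):

import struct

with open(filename, 'rb') as f:
    # '8sii' yields a 3-tuple: (8-byte string, 4-byte int, 4-byte int)
    magic, header_offset, header_length = struct.unpack('8sii', f.read(16))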

This line is making an incorrect assumption:
header_offset, header_length = struct.unpack('8sii', f.read(16))
You are probably better off checking the size of the tuple returned from struct.unpack and then dealing with the results in a conditional manner.
Note that according to the docs, this method always returns a tuple, even if that tuple is of length one.
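For instance, a one-field format still comes back as a 1-tuple:

import struct

result = struct.unpack('i', '\x2a\x00\x00\x00')  # -> (42,) on a little-endian machine
value = result[0]                                # or: (value,) = result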

Stupid me - I had removed magic, from before header_offset in the unpacking.


Related

How to read a string in a bitstream in Python

I'm writing the following string in my output file:
bitstream.add("stri", 32)
where
def add(self, data, length):
    s = ''
    if (type(data) == str):
        for char in data:
            b = bin(ord(char))[2:]
            s = s + "{:0>8}".format(b)
    else:
        s = bin(data)[2:]
    if (len(s) < length):
        resto = length - len(s)
        for _ in range(0, resto):
            s = '0' + s
    s = s[0:length]
    self.cache = self.cache + s
    self.flush()
Later on I need to read the string back from the output file. I use Python's struct.unpack as follows:
from struct import unpack

key_length_bytes = 32
key = ""
for _ in range(0, key_length_bytes):
    carattere = chr(unpack('>B', in_file.read(1))[0])
    key = "%s%s" (key, carattere)
I get
key = "%s%s" (key, carattere)
TypeError: 'str' object is not callable
Thank you for any help you could provide.
You're missing a % sign.
key = "%s%s" (key, carattere) needs to be changed into
key = "%s%s" % (key, carattere)
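Simple concatenation would also work here and avoids the format string entirely:

key = key + carattere  # or just: key += carattere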

Parsing Event Information Table files

My dreambox compatible video recorder stores event information table ".eit" files with every recording. I'd like to work with this information to rearrange my recordings.
A similar question came up in http://www.i-have-a-dreambox.com/wbb2/thread.php?threadid=186234&sid=3b36acb1ba62e4724cb47216ce08a564
The format seems to be a binary format as outlined in:
https://de.wikipedia.org/wiki/Event_Information_Table
and
http://www.etsi.org/deliver/etsi_en/300400_300499/300468/01.14.01_60/en_300468v011401p.pdf
I am now looking for a parser for such files. Where could I find one that works with files and does not assume a broadcast stream as input?
What did I try so far?
I searched the web and found the following links and pointers:
There seems to be a Java library
https://docs.oracle.com/javame/config/cdc/opt-pkgs/api/jsr927/javax/tv/service/guide/ProgramEvent.html
which is part of the JSR 927 https://jcp.org/en/jsr/detail?id=927
specification.
It looks like this library is only available for Java ME; see https://en.wikipedia.org/wiki/Java_TV
I found some DVB-related EIT code snippets, e.g.
https://github.com/jinfeng-geeya/3202C/blob/master/SRC/lib/libdvb/libepg/eit_parser.c
or
http://linuxtv.org/docs/libdvbv5/eit_8h.html
As part of the Kamaelia DVB Tools Project (http://www.kamaelia.org/Developers/Projects/DVBTools.html) there seems to be a Python solution:
http://pydoc.net/Python/Kamaelia/0.6.0/Kamaelia.Device.DVB.Parse.ParseEventInformationTable/
The closest I found so far was from a hint at http://forums.openpli.org/topic/29141-eit-file-format/ which points to:
https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
Currently I am pursuing a solution based on this open-source Python code.
This is a Python script that seems to be a valid start.
It's available as open source at https://github.com/WolfgangFahl/eitparser where you'll find the latest Python 3 compatible version and documentation.
When you call it with
python EitParser.py SomeEitFile
it will print out the name and description of the eit file.
Add your language codes as needed, e.g. from https://github.com/libo/Enigma2/blob/master/lib/python/Tools/ISO639.py
#!/usr/bin/python
# encoding: utf-8
#
# EitSupport
# Copyright (C) 2011 betonme
# Copyright (C) 2016 Wolfgang Fahl
#
# This EITParser is based on:
# https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
#
# In case of reuse of this source code please do not remove this copyright.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For more information on the GNU General Public License see:
# <http://www.gnu.org/licenses/>.
#

import os
import struct
import time

from datetime import datetime

#from Components.config import config
#from Components.Language import language
#from EMCTasker import emcDebugOut
#from IsoFileSupport import IsoSupport
#from MetaSupport import getInfoFile

#def crc32(data):
#    poly = 0x4c11db7
#    crc = 0xffffffffL
#    for byte in data:
#        byte = ord(byte)
#        for bit in range(7,-1,-1):  # MSB to LSB
#            z32 = crc>>31  # top bit
#            crc = crc << 1
#            if ((byte>>bit)&1) ^ z32:
#                crc = crc ^ poly
#            crc = crc & 0xffffffffL
#    return crc

decoding_charSpecHR = {u'Ć': u'\u0106', u'æ': u'\u0107', u'®': u'\u017D', u'¾': u'\u017E', u'©': u'\u0160', u'¹': u'\u0161', u'Č': u'\u010C', u'è': u'\u010D', u'ð': u'\u0111'}

decoding_charSpecCZSK = {u'Ï'+u'C': u'Č',u'Ï'+u'E': u'Ě',u'Ï'+u'L': u'Ľ',u'Ï'+u'N': u'Ň',u'Ï'+u'R': u'Ř',u'Ï'+u'S': u'Š',u'Ï'+u'T': u'Ť',u'Ï'+u'Z': u'Ž',u'Ï'+u'c': u'č',u'Ï'+u'd': u'ď',u'Ï'+u'e': u'ě',u'Ï'+u'l': u'ľ', u'Ï'+u'n': u'ň',
                         u'Ï'+u'r': u'ř',u'Ï'+u's': u'š',u'Ï'+u't': u'ť',u'Ï'+u'z': u'ž',u'Ï'+u'D': u'Ď',u'Â'+u'A': u'Á',u'Â'+u'E': u'É',u'Â'+u'I': u'Í',u'Â'+u'O': u'Ó',u'Â'+u'U': u'Ú',u'Â'+u'a': u'á',u'Â'+u'e': u'é',u'Â'+u'i': u'í',u'Â'+u'o': u'ó',
                         u'Â'+u'u': u'ú',u'Â'+u'y': u'ý',u'Ã'+u'o': u'ô',u'Ã'+u'O': u'Ô',u'Ê'+u'u': u'ů',u'Ê'+u'U': u'Ů',u'È'+u'A': u'Ä',u'È'+u'E': u'Ë',u'È'+u'I': u'Ï',u'È'+u'O': u'Ö',u'È'+u'U': u'Ü',u'È'+u'Y': u'Ÿ',u'È'+u'a': u'ä',u'È'+u'e': u'ë',
                         u'È'+u'i': u'ï',u'È'+u'o': u'ö',u'È'+u'u': u'ü',u'È'+u'y': u'ÿ'}

def convertCharSpecHR(text):
    for i, j in decoding_charSpecHR.iteritems():
        text = text.replace(i, j)
    return text

def convertCharSpecCZSK(text):
    for i, j in decoding_charSpecCZSK.iteritems():
        text = text.replace(i, j)
    return text

def parseMJD(MJD):
    # Parse 16 bit unsigned int containing Modified Julian Date,
    # as per DVB-SI spec
    # returning year,month,day
    YY = int( (MJD - 15078.2) / 365.25 )
    MM = int( (MJD - 14956.1 - int(YY*365.25) ) / 30.6001 )
    D = MJD - 14956 - int(YY*365.25) - int(MM * 30.6001)
    K = 0
    if MM == 14 or MM == 15: K = 1
    return (1900 + YY+K), (MM-1-K*12), D

def unBCD(byte):
    return (byte>>4)*10 + (byte & 0xf)

#from Tools.ISO639 import LanguageCodes
# -*- coding: iso-8859-2 -*-
LanguageCodes = { }
LanguageCodes["deu"] = LanguageCodes["ger"] = LanguageCodes["de"] = ("German", "Germanic")
LanguageCodes["fra"] = LanguageCodes["fre"] = LanguageCodes["fr"] = ("French", "Romance")

def language_iso639_2to3(alpha2):
    ret = alpha2
    if alpha2 in LanguageCodes:
        language = LanguageCodes[alpha2]
        for alpha, name in LanguageCodes.items():
            if name == language:
                if len(alpha) == 3:
                    return alpha
    return ret

#TEST
#print LanguageCodes["sv"]
#print language_iso639_2to3("sv")

# Eit File support class
# Description
# http://de.wikipedia.org/wiki/Event_Information_Table
class EitList():

    EIT_SHORT_EVENT_DESCRIPTOR = 0x4d
    EIT_EXTENDED_EVENT_DESCRIPOR = 0x4e

    def __init__(self, path=None):
        self.eit_file = None
        #TODO
        # The dictionary implementation could be very slow
        self.eit = {}
        self.iso = None
        self.__newPath(path)
        self.__readEitFile()

    def __newPath(self, path):
        name = None
        if path:
            if self.eit_file != path:
                self.eit_file = path

    def __mk_int(self, s):
        return int(s) if s else 0

    def __toDate(self, d, t):
        if d and t:
            #TODO Is there another fast and safe way to get the datetime
            try:
                return datetime(int(d[0]), int(d[1]), int(d[2]), int(t[0]), int(t[1]))
            except ValueError:
                return None
        else:
            return None

    ##############################################################################
    ## Get Functions

    def getEitsid(self):
        return self.eit.get('service', "") #TODO

    def getEitTsId(self):
        return self.eit.get('transportstream', "") #TODO

    def getEitWhen(self):
        return self.eit.get('when', "")

    def getEitStartDate(self):
        return self.eit.get('startdate', "")

    def getEitStartTime(self):
        return self.eit.get('starttime', "")

    def getEitDuration(self):
        return self.eit.get('duration', "")

    def getEitName(self):
        return self.eit.get('name', "").strip()

    def getEitDescription(self):
        return self.eit.get('description', "").strip()

    # Wrapper
    def getEitShortDescription(self):
        return self.getEitName()

    def getEitExtendedDescription(self):
        return self.getEitDescription()

    def getEitLengthInSeconds(self):
        length = self.eit.get('duration', "")
        #TODO Is there another fast and safe way to get the length
        if len(length) > 2:
            return self.__mk_int((length[0]*60 + length[1])*60 + length[2])
        elif len(length) > 1:
            return self.__mk_int(length[0]*60 + length[1])
        else:
            return self.__mk_int(length)

    def getEitDate(self):
        return self.__toDate(self.getEitStartDate(), self.getEitStartTime())

    ##############################################################################
    ## File IO Functions

    def __readEitFile(self):
        data = ""
        path = self.eit_file

        #lang = language.getLanguage()[:2]
        lang = language_iso639_2to3( "de" )
        #print lang + str(path)

        if path and os.path.exists(path):
            #print "Reading Event Information Table " + str(path)

            # Read data from file
            # OE1.6 with Python 2.6
            #with open(self.eit_file, 'r') as file: lines = file.readlines()
            f = None
            try:
                f = open(path, 'rb')
                #lines = f.readlines()
                data = f.read()
            except Exception, e:
                emcDebugOut("[META] Exception in readEitFile: " + str(e))
            finally:
                if f is not None:
                    f.close()

            # Parse the data
            if data and 12 <= len(data):
                # go through events
                pos = 0
                e = struct.unpack(">HHBBBBBBH", data[pos:pos+12])
                event_id = e[0]
                date = parseMJD(e[1])                             # Y, M, D
                time = unBCD(e[2]), unBCD(e[3]), unBCD(e[4])      # HH, MM, SS
                duration = unBCD(e[5]), unBCD(e[6]), unBCD(e[7])  # HH, MM, SS
                running_status = (e[8] & 0xe000) >> 13
                free_CA_mode = e[8] & 0x1000
                descriptors_len = e[8] & 0x0fff

                if running_status in [1, 2]:
                    self.eit['when'] = "NEXT"
                elif running_status in [3, 4]:
                    self.eit['when'] = "NOW"

                self.eit['startdate'] = date
                self.eit['starttime'] = time
                self.eit['duration'] = duration

                pos = pos + 12
                short_event_descriptor = []
                short_event_descriptor_multi = []
                extended_event_descriptor = []
                extended_event_descriptor_multi = []
                component_descriptor = []
                content_descriptor = []
                linkage_descriptor = []
                parental_rating_descriptor = []
                endpos = len(data) - 1
                while pos < endpos:
                    rec = ord(data[pos])
                    length = ord(data[pos+1]) + 2
                    if rec == 0x4D:
                        descriptor_tag = ord(data[pos+1])
                        descriptor_length = ord(data[pos+2])
                        ISO_639_language_code = str(data[pos+3:pos+5])
                        event_name_length = ord(data[pos+5])
                        short_event_description = data[pos+6:pos+6+event_name_length]
                        if ISO_639_language_code == lang:
                            short_event_descriptor.append(short_event_description)
                        short_event_descriptor_multi.append(short_event_description)
                    elif rec == 0x4E:
                        ISO_639_language_code = str(data[pos+3:pos+5])
                        extended_event_description = ""
                        extended_event_description_multi = ""
                        for i in range(pos+8, pos+length):
                            if str(ord(data[i])) == "138":
                                extended_event_description += '\n'
                                extended_event_description_multi += '\n'
                            else:
                                if data[i] == '\x10' or data[i] == '\x00' or data[i] == '\x02':
                                    pass
                                else:
                                    extended_event_description += data[i]
                                    extended_event_description_multi += data[i]
                        if ISO_639_language_code == lang:
                            extended_event_descriptor.append(extended_event_description)
                        extended_event_descriptor_multi.append(extended_event_description)
                    elif rec == 0x50:
                        component_descriptor.append(data[pos+8:pos+length])
                    elif rec == 0x54:
                        content_descriptor.append(data[pos+8:pos+length])
                    elif rec == 0x4A:
                        linkage_descriptor.append(data[pos+8:pos+length])
                    elif rec == 0x55:
                        parental_rating_descriptor.append(data[pos+2:pos+length])
                    else:
                        #print "unsupported descriptor: %x %x" % (rec, pos + 12)
                        #print data[pos:pos+length]
                        pass
                    pos += length

                # Very bad but there can be both encodings
                # User files can be in cp1252
                # Is there no other way?
                if short_event_descriptor:
                    short_event_descriptor = "".join(short_event_descriptor)
                else:
                    short_event_descriptor = "".join(short_event_descriptor_multi)
                if short_event_descriptor:
                    #try:
                    #    short_event_descriptor = short_event_descriptor.decode("iso-8859-1").encode("utf-8")
                    #except UnicodeDecodeError:
                    #    pass
                    try:
                        short_event_descriptor.decode('utf-8')
                    except UnicodeDecodeError:
                        try:
                            short_event_descriptor = short_event_descriptor.decode("cp1252").encode("utf-8")
                        except UnicodeDecodeError:
                            # do nothing, otherwise cyrillic won't be displayed properly
                            #short_event_descriptor = short_event_descriptor.decode("iso-8859-1").encode("utf-8")
                            pass
                    if (lang == "cs") or (lang == "sk"):
                        short_event_descriptor = str(convertCharSpecCZSK(short_event_descriptor))
                    if (lang == "hr"):
                        short_event_descriptor = str(convertCharSpecHR(short_event_descriptor))
                self.eit['name'] = short_event_descriptor

                # Very bad but there can be both encodings
                # User files can be in cp1252
                # Is there no other way?
                if extended_event_descriptor:
                    extended_event_descriptor = "".join(extended_event_descriptor)
                else:
                    extended_event_descriptor = "".join(extended_event_descriptor_multi)
                if extended_event_descriptor:
                    #try:
                    #    extended_event_descriptor = extended_event_descriptor.decode("iso-8859-1").encode("utf-8")
                    #except UnicodeDecodeError:
                    #    pass
                    try:
                        extended_event_descriptor.decode('utf-8')
                    except UnicodeDecodeError:
                        try:
                            extended_event_descriptor = extended_event_descriptor.decode("cp1252").encode("utf-8")
                        except UnicodeDecodeError:
                            # do nothing, otherwise cyrillic won't be displayed properly
                            #extended_event_descriptor = extended_event_descriptor.decode("iso-8859-1").encode("utf-8")
                            pass
                    if (lang == "cs") or (lang == "sk"):
                        extended_event_descriptor = str(convertCharSpecCZSK(extended_event_descriptor))
                    if (lang == "hr"):
                        extended_event_descriptor = str(convertCharSpecHR(extended_event_descriptor))
                self.eit['description'] = extended_event_descriptor
            else:
                # No date clear all
                self.eit = {}

"""Module docstring.

Read Eit File and show the information.
"""
import sys
import getopt

def readeit(eitfile):
    eitlist = EitList(eitfile)
    print eitlist.getEitName()
    print eitlist.getEitStartDate()
    print eitlist.getEitDescription()

def main():
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.error, msg:
        print msg
        print "for help use --help"
        sys.exit(2)
    # process options
    for o, a in opts:
        if o in ("-h", "--help"):
            print __doc__
            sys.exit(0)
    # process arguments
    for arg in args:
        readeit(arg)

if __name__ == "__main__":
    main()

Attribute error while reading wav file

I am new to Python and I am trying to build a program that can encode and decode dual-tone multi-frequency (DTMF) signals used to dial a telephone.
For now the encoding part is working well, but for some reason the decoding is not working and I get the following exception:
Traceback (most recent call last):
File "C:\Users\matant\workspace\dialer2\dialer.py", line 239, in <module>
x = d.decoder()
File "C:\Users\matant\workspace\dialer2\dialer.py", line 218, in decoder
data = self.read_wav()
File "C:\Users\matant\workspace\dialer2\dialer.py", line 201, in read_wav
n = fin.getnframes()
AttributeError: 'file' object has no attribute 'getnframes'
As you can see, I am writing frames into the file, so I don't understand why this happens.
This is my code:
'''
Created on Jan 10, 2016

@author: matant
'''
import json
from math import pi, sin
import wave
import logging
import struct
import os

ROW_FREQ = (697, 770, 852, 941)
COL_FREQ = (1209, 1336, 1477, 1633)
SAMPLE_RATE = 44100
SAMPLE_WIDTH = 2
NUMBER_OF_CHANNELS = 1
COMPRESSION_TYPE = "NONE"
COMPRESSION_NAME = "Uncompressed"
PI2 = 6.283185306
scale = 32767  # 16-bit unsigned short

keys = '1','2','3','A',\
       '4','5','6','B',\
       '7','8','9','C',\
       '*','0','#','D'

FREQUENCY_MAP = dict()
FREQUENCY_MAP['1'] = (697, 1209)
FREQUENCY_MAP['2'] = (697, 1336)
FREQUENCY_MAP['3'] = (697, 1477)
FREQUENCY_MAP['A'] = (697, 1633)
FREQUENCY_MAP['4'] = (770, 1209)
FREQUENCY_MAP['5'] = (770, 1336)
FREQUENCY_MAP['6'] = (770, 1477)
FREQUENCY_MAP['B'] = (770, 1633)
FREQUENCY_MAP['7'] = (852, 1209)
FREQUENCY_MAP['8'] = (852, 1336)
FREQUENCY_MAP['9'] = (852, 1477)
FREQUENCY_MAP['C'] = (852, 1633)
FREQUENCY_MAP['*'] = (941, 1209)
FREQUENCY_MAP['0'] = (941, 1336)
FREQUENCY_MAP['#'] = (941, 1477)
FREQUENCY_MAP['D'] = (941, 1633)
FREQUENCY_MAP['S'] = (0, 0)

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s <%(levelname)s> %(module)s.%(funcName)s() %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
log = logging.getLogger(__name__)

class DTMF:
    VALID_SEQUENCE_TYPES = [list, tuple, set]

    def __init__(self, input_string=None, input_list=None):
        """
        Initializes a DTMF instance with an optional DTMF sequence. This can be a list of lists or a json string.
        If both are supplied, it tries to parse the json_string. If it does, it uses that. If there are errors, it
        validates the list and tries to use that. Basically input_string takes precedence.
        General workflow would be setting dtmf_sequence and calling generate_raw_data. This data can then be saved to a
        .wav file or compressed and saved as other, smaller, file formats.
        :param input_list: list of lists or tuples of the form [['A', 100], ['S', 50], ['2', 100], ['S', 50]]
        :param input_string: json_string of the form '[["A", 100], ["S", 50], ["2", 100], ["S", 50]]'
        """
        log.debug("Creating instance of DTMF")
        log.debug("input_string = {}".format(input_string))
        log.debug("input_list = {}".format(input_list))
        self._dtmf_sequence = None
        self._raw_data = None
        if input_string is not None:
            converted_json_sequence = self.parse_json_string(input_string)
            self._dtmf_sequence = converted_json_sequence
        elif input_list is not None:
            self._dtmf_sequence = input_list

    @property
    def dtmf_sequence(self):
        return self._dtmf_sequence

    @dtmf_sequence.setter
    def dtmf_sequence(self, input_sequence):
        if type(input_sequence) == str:
            input_sequence = self.parse_json_string(input_sequence)
        if type(input_sequence) == list:
            if self._dtmf_sequence_is_valid(input_sequence):
                self._dtmf_sequence = input_sequence
                log.debug("Set _dtmf_sequence to {}".format(self._dtmf_sequence))

    def parse_json_string(self, input_string):
        return json.loads(input_string)

    def generate_raw_data(self):
        """
        Generates raw data that can be saved into a .wav file. This can take some time to generate.
        :raise AttributeError: If no dtmf sequence has been set
        """
        _data = list()
        if self._dtmf_sequence is None:
            raise AttributeError("No dtmf sequence set")
        for tone_tuple in self._dtmf_sequence:
            key = tone_tuple[0]
            tone_duration = tone_tuple[1]
            f1 = FREQUENCY_MAP[key][0]
            f2 = FREQUENCY_MAP[key][1]
            _data += (self.generate_tone(f1, f2, tone_duration))
        self._raw_data = _data

    def save_wave_file(self, file_path):
        if self._raw_data is None or len(self._raw_data) < 1:
            self.generate_raw_data()
        f = wave.open(file_path, 'w')
        f.setnchannels(NUMBER_OF_CHANNELS)
        f.setsampwidth(SAMPLE_WIDTH)
        f.setframerate(SAMPLE_RATE)
        f.setnframes(len(self._raw_data))
        f.setcomptype(COMPRESSION_TYPE, COMPRESSION_NAME)
        log.info("Saving wav file {} THIS MAY TAKE A WHILE".format(file_path))
        for i in self._raw_data:
            f.writeframes(struct.pack('i', i))
        log.info("Saved file to {0}".format(file_path))
        f.close()

    @staticmethod
    def dtmf_sequence_is_valid(input_list):
        """
        Validates an input sequence for proper structure and contents.
        :param input_list:
        :return:
        """
        if type(input_list) is not list:
            log.warning('input_list must be a list instance')
            return False
        if [(type(item) in DTMF.VALID_SEQUENCE_TYPES) for item in input_list].count(False) != 0:
            log.warning('input_list contains invalid sequence type')
            return False
        for item in input_list:
            if type(item[0]) != str or type(item[1]) != int:
                log.debug("Type list[0]: {}".format(type(item[0])))
                log.debug("Type list[1]: {}".format(type(item[1])))
                log.warning('input_list must contain a list of sequences of [str, int]')
                return False
        return True

    @staticmethod
    def generate_tone(f1, f2, _duration_in_ms):
        """
        Generates a single value representing a sample of two combined frequencies.
        :param f1:
        :param f2:
        :param _duration_in_ms:
        :return:
        """
        assert f1 in ROW_FREQ or f1 == 0
        assert f2 in COL_FREQ or f2 == 0
        number_of_samples = int(SAMPLE_RATE * _duration_in_ms / 1000)
        scale = 32767  # signed int / 2
        result = list()
        for i in range(number_of_samples):
            p = i * 1.0 / SAMPLE_RATE
            result.append(int((sin(p * f1 * pi * 2) + sin(p * f2 * pi * 2)) / 2 * scale))
        log.info(
            "Generated {0}ms tone of {1} samples with F1: {2} F2: {3}".format(_duration_in_ms, number_of_samples, f1,
                                                                              f2))
        return result

    def create_dtmf_wave_file(self, input_sequence, file_path, dump_to_csv=False):
        """
        A convenience method. Validates and assigns a dtmf_sequence, then generates data and saves to a .wav
        :param input_sequence: list of lists or tuples of the form [['A', 100], ['S', 50], ['2', 100], ['S', 50]] or json_string of the form '[["A", 100], ["S", 50], ["2", 100], ["S", 50]]'
        :param file_path: the full path of the wav file that will be saved
        """
        self._dtmf_sequence = input_sequence
        self.generate_raw_data()
        try:
            os.remove('dtmf_dump.csv')
        except:
            pass  # file doesn't exist
        if dump_to_csv:
            with open('dtmf_dump.csv', 'w') as f:
                for d in self._raw_data:
                    f.write(str(d))
                    f.write(",")
        self.save_wave_file(file_path)

    def read_wav(self):
        fin = open('testNum.wav', 'r')
        n = fin.getnframes()
        d = fin.readframes(n)
        fin.close()
        data = []
        for i in range(n):
            #LS8bit = inv_endian(ord(d[2*i]))
            #MS8bit = inv_endian(ord(d[2*i+1]))
            LS8bit, MS8bit = ord(d[2*i]), ord(d[2*i+1])
            data.append((MS8bit<<8) + LS8bit)
        return data

    # Decoder takes a DTMF signal file (.wav), sampled at 44,000
    # 16-bit samples per second, and decode the corresponding symbol X.
    def decoder(self):
        data = self.read_wav()
        temp = []
        for f1 in ROW_FREQ:
            for f2 in COL_FREQ:
                diff = 0
                for i in range(SAMPLE_RATE):  # assume phase has not shifted dramatically
                    p = i*1.0/SAMPLE_RATE
                    S = int(scale + scale*(sin(p*f1*PI2) + sin(p*f2*PI2))/2)
                    diff += abs(S - data[i])
                temp.append((diff, f1, f2))
        f1, f2 = min(temp)[1:]  # retrieve the frequency of minimum signal distortion
        i, j = ROW_FREQ.index(f1), COL_FREQ.index(f2)
        X = keys[4*i+j]
        print 'Decoded key is: ', X
        return X

if __name__ == '__main__':
    d = 100
    sample_input = [('0', d), ('5', d), ('0', d), ('8', d), ('6', d), ('9', d), ('0', d), ('1', d), ('8', d), ('6', d)]
    d = DTMF()
    d.create_dtmf_wave_file(sample_input, file_path='testNum.wav', dump_to_csv=True)
    x = d.decoder()
fin = open('testNum.wav','r')
Looks like you're using the built-in open function instead of the one from the wave module. Try:
fin = wave.open('testNum.wav','r')
You have to operate on the Wave_read object returned by wave.open; that object has the getnframes attribute your code is trying to access.
You could also add a from wave import open statement, which would shadow the built-in open, but it's clearer to access it through dot notation as wave.open.
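A minimal sketch of the fix in read_wav, assuming the same testNum.wav written earlier:

import wave

fin = wave.open('testNum.wav', 'rb')  # Wave_read object, not a plain file
n = fin.getnframes()                  # now available
d = fin.readframes(n)
fin.close()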

Can I use bisect to print the content of a line?

I have a file where each line is ordered alphabetically. The file is 12 GB, which means I can't simply read it line by line. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os

class Query(object):
    def __init__(self, query, index=5):
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        return self.query < comparable[self.index:]

class FileSearcher(object):
    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        return self.file_size

    def __getitem__(self, item):
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)

with open('myfile') as file_to_search:
    query = 'fox\t'  # token to query
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    linepos = bisect.bisect(searchable_file, wrapped_query)
    print "Located # line: ", linepos
    #print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want to go with a Python solution, you can do the following:
- Read the file in small chunks of MAX_LINE bytes, each time moving forward by a fixed offset; that offset determines the block size.
- For each such read, determine the key (the first word in a line); these keys serve as delimiters of blocks.
- Construct the list of such keys. The list will be sorted, since the keys are ordered. You may persist such a list somewhere via pickle/json.dumps/...
- When querying, find via bisect the index of the block where your key is located, then read that block entirely and find the key with its data.
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect

MAX_LINE = 7
BLOCK_SIZE = 10

def parse_chunks(filename):
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        block = str(file.read(MAX_LINE*2))
        first_line = block[:block.find('\n') + 1]
        chunks.append(first_line.split()[0])
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            block = str(file.read(MAX_LINE*2))
            first_eol = block.find('\n')
            second_eol = block.find('\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            key = line.split()[0]
            chunks.append(key)
            pos += BLOCK_SIZE
    return chunks

if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    pos_before = bisect(chunks, query) - 1
    with open(filename, 'rb') as file:
        file.seek(pos_before*BLOCK_SIZE)
        block = str(file.read(BLOCK_SIZE + MAX_LINE))
        line_start = block.find(query)
        line_end = block.find('\n', line_start + 1)
        line = block[line_start:line_end]
        print(line)
In this toy example I use a block size of 10 bytes; in your case of a 12 GB file I'd suggest starting with 1 MB.
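If building the key list over the 12 GB file takes a while, you can build it once and persist it, as mentioned in the steps above. A sketch using pickle (the load_chunks name and the bigfile.idx path are just assumptions for illustration):

import os
import pickle

def load_chunks(filename, index_path='bigfile.idx'):
    # Reuse a previously built index if present; otherwise build and save it
    if os.path.exists(index_path):
        with open(index_path, 'rb') as f:
            return pickle.load(f)
    chunks = parse_chunks(filename)
    with open(index_path, 'wb') as f:
        pickle.dump(chunks, f)
    return chunks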
The following recursive function should be able to narrow the search interval. I'm not sure it handles every case; you may need to modify it so that it returns a match, or None when there is no match.
def bisearch(f, word, i, j):
    if (j-i) < 1E6:
        return i, j
    k = (i+j)/2
    f.seek(k)
    # skip ahead to the start of the next line
    while k < j:
        c = f.read(1)
        k = k+1
        if c == '\n':
            break
    else:
        # ??? no match ??? I'm not sure
        pass
    # read the first word of the line (up to the tab)
    w = []
    while 1:
        c = f.read(1)
        if c == '\t':
            break
        w.append(c)
    w = "".join(w)
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j)
    else:
        return bisearch(f, word, i, k)
And here is an example of usage:
word = ...
f = open(...)
i, j = bisearch(f, word, 0, len_f)
f.seek(i)
if i == j:
    line = f.readline()
else:
    #################### EDIT ################
    # OLD
    # buffer = f.read(1E6)
    # NEW
    buffer = f.read(j-i)
    lenw = len(word)
    for line in buffer.split('\n'):
        if line[:lenw] == word:
            break
    else:
        # no matches, SOS
        pass
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()

Is there a library for retrieving a file from a remote zip? [closed]

The goal is just to retrieve a specific file without downloading the entire contents, using the HTTP range method as described:
http://www.codeproject.com/KB/cs/remotezip.aspx
You can solve this a bit more generally with less code. Essentially, create enough of a file-like object for ZipFile to use. So you wind up with z = ZipFile(HttpFile(url)) and it dynamically downloads just the portion needed. The advantage with this is you write less code, and it applies to more than just zip files. (In fact, I wonder if there is something like this already... I'm not finding it though.)
Using the same idea, you could also create a caching wrapper for HttpFile to avoid repeated downloads; a minimal sketch of that follows the code below.
And here's the code: (note the lack of error-handling)
#!/usr/bin/python
import urllib2

class HttpFile(object):
    def __init__(self, url):
        self.url = url
        self.offset = 0
        self._size = -1

    def size(self):
        if self._size < 0:
            f = urllib2.urlopen(self.url)
            self._size = int(f.headers["Content-length"])
        return self._size

    def read(self, count=-1):
        req = urllib2.Request(self.url)
        if count < 0:
            end = self.size() - 1
        else:
            end = self.offset + count - 1
        req.headers['Range'] = "bytes=%s-%s" % (self.offset, end)
        f = urllib2.urlopen(req)
        data = f.read()
        # FIXME: should check that we got the range expected, etc.
        chunk = len(data)
        if count >= 0:
            assert chunk == count
        self.offset += chunk
        return data

    def seek(self, offset, whence=0):
        if whence == 0:
            self.offset = offset
        elif whence == 1:
            self.offset += offset
        elif whence == 2:
            self.offset = self.size() + offset
        else:
            raise Exception("Invalid whence")

    def tell(self):
        return self.offset
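And here is a minimal sketch of the caching wrapper mentioned above (a hypothetical CachedHttpFile; it naively memoizes whole read() calls keyed by offset and count, while a real one would want block-aligned caching and an eviction policy):

class CachedHttpFile(object):
    """Wrap an HttpFile and memoize range reads to avoid repeated downloads."""
    def __init__(self, http_file):
        self.f = http_file
        self.cache = {}  # (offset, count) -> data

    def read(self, count=-1):
        key = (self.f.tell(), count)
        if key in self.cache:
            # replay cached data, advancing the underlying offset as a real read would
            self.f.seek(len(self.cache[key]), 1)
        else:
            self.cache[key] = self.f.read(count)
        return self.cache[key]

    def seek(self, offset, whence=0):
        self.f.seek(offset, whence)

    def tell(self):
        return self.f.tell()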
Since there was no such library, I have written a small module myself; most of the code and logic is from zipfile, with the seeks/reads translated to HTTP range requests.
Feel free to review and suggest improvements:
The code:
"""
Read remote ZIP files using HTTP range requests
"""
import struct
import urllib2
import zlib
import cStringIO

from zipfile import ZipInfo, ZipExtFile, BadZipfile
from os.path import join, basename

# The code is mostly adapted from the zipfile module
# NOTE: ZIP64 is not supported

# The "end of central directory" structure, magic number, size, and indices
# (section V.I in the format document)
structEndArchive = "<4s4H2LH"
stringEndArchive = "PK\005\006"
sizeEndCentDir = struct.calcsize(structEndArchive)

_ECD_SIGNATURE = 0
_ECD_DISK_NUMBER = 1
_ECD_DISK_START = 2
_ECD_ENTRIES_THIS_DISK = 3
_ECD_ENTRIES_TOTAL = 4
_ECD_SIZE = 5
_ECD_OFFSET = 6
_ECD_COMMENT_SIZE = 7
# These last two indices are not part of the structure as defined in the
# spec, but they are used internally by this module as a convenience
_ECD_COMMENT = 8
_ECD_LOCATION = 9

# The "central directory" structure, magic number, size, and indices
# of entries in the structure (section V.F in the format document)
structCentralDir = "<4s4B4HL2L5H2L"
stringCentralDir = "PK\001\002"
sizeCentralDir = struct.calcsize(structCentralDir)

# indexes of entries in the central directory structure
_CD_SIGNATURE = 0
_CD_CREATE_VERSION = 1
_CD_CREATE_SYSTEM = 2
_CD_EXTRACT_VERSION = 3
_CD_EXTRACT_SYSTEM = 4
_CD_FLAG_BITS = 5
_CD_COMPRESS_TYPE = 6
_CD_TIME = 7
_CD_DATE = 8
_CD_CRC = 9
_CD_COMPRESSED_SIZE = 10
_CD_UNCOMPRESSED_SIZE = 11
_CD_FILENAME_LENGTH = 12
_CD_EXTRA_FIELD_LENGTH = 13
_CD_COMMENT_LENGTH = 14
_CD_DISK_NUMBER_START = 15
_CD_INTERNAL_FILE_ATTRIBUTES = 16
_CD_EXTERNAL_FILE_ATTRIBUTES = 17
_CD_LOCAL_HEADER_OFFSET = 18

# The "local file header" structure, magic number, size, and indices
# (section V.A in the format document)
structFileHeader = "<4s2B4HL2L2H"
stringFileHeader = "PK\003\004"
sizeFileHeader = struct.calcsize(structFileHeader)

_FH_SIGNATURE = 0
_FH_EXTRACT_VERSION = 1
_FH_EXTRACT_SYSTEM = 2
_FH_GENERAL_PURPOSE_FLAG_BITS = 3
_FH_COMPRESSION_METHOD = 4
_FH_LAST_MOD_TIME = 5
_FH_LAST_MOD_DATE = 6
_FH_CRC = 7
_FH_COMPRESSED_SIZE = 8
_FH_UNCOMPRESSED_SIZE = 9
_FH_FILENAME_LENGTH = 10
_FH_EXTRA_FIELD_LENGTH = 11

def _http_get_partial_data(url, start_range, end_range=None):
    req = urllib2.Request(url)
    range_header = "bytes=%s" % start_range
    if end_range is not None:
        range_header += "-%s" % end_range
    req.headers['Range'] = range_header
    f = urllib2.urlopen(req)
    return f

def _EndRecData(url):
    """Return data from the "End of Central Directory" record, or None.

    The data is a list of the nine items in the ZIP "End of central dir"
    record followed by a tenth item, the file seek offset of this record."""
    ECD = _http_get_partial_data(url, -sizeEndCentDir)
    content_range = ECD.headers.get('Content-Range')
    filesize = int(content_range.split('/')[1]) if content_range and '/' in content_range else 0
    data = ECD.read()
    ECD.close()
    if data[0:4] == stringEndArchive and data[-2:] == "\000\000":
        # the signature is correct and there's no comment, unpack structure
        endrec = struct.unpack(structEndArchive, data)
        endrec = list(endrec)
        # Append a blank comment and record start offset
        endrec.append("")
        endrec.append(filesize - sizeEndCentDir)
        return endrec

    # Either this is not a ZIP file, or it is a ZIP file with an archive
    # comment. Search the end of the file for the "end of central directory"
    # record signature. The comment is the last item in the ZIP file and may be
    # up to 64K long. It is assumed that the "end of central directory" magic
    # number does not appear in the comment.
    # Search by retrieving chunks of 256, 1k and 64k
    try_ranges = (1 << 8, 1 << 10, 1 << 16)
    for check_range in try_ranges:
        ECD = _http_get_partial_data(url, -(check_range + sizeEndCentDir))
        data = ECD.read()
        content_range = ECD.headers.get('Content-Range')
        ECD.close()
        # Content-Range looks like "bytes 12345-67890/67891"
        download_start = int(content_range.split(' ')[1].split('-')[0])
        start = data.rfind(stringEndArchive)
        if start >= 0:
            # found the magic number; attempt to unpack and interpret
            recData = data[start:start+sizeEndCentDir]
            endrec = list(struct.unpack(structEndArchive, recData))
            commentSize = endrec[_ECD_COMMENT_SIZE]  # as claimed by the zip file
            comment = data[start+sizeEndCentDir:start+sizeEndCentDir+commentSize]
            endrec.append(comment)
            endrec.append(download_start + start)
            return endrec
    raise IOError

class HTTPZipFile:
    def __init__(self, url):
        self.url = url
        self.NameToInfo = {}  # Find file info given name
        self.filelist = []    # List of ZipInfo instances for archive
        self.pwd = None
        self.comment = ''
        self.debug = 0
        self._RealGetContents()

    def _RealGetContents(self):
        """Read in the table of contents for the ZIP file."""
        try:
            endrec = _EndRecData(self.url)
        except IOError:
            raise BadZipfile("File is not a zip file")
        if not endrec:
            raise BadZipfile, "File is not a zip file"
        if self.debug > 1:
            print endrec
        size_cd = endrec[_ECD_SIZE]          # bytes in central directory
        offset_cd = endrec[_ECD_OFFSET]      # offset of central directory
        self.comment = endrec[_ECD_COMMENT]  # archive comment
        # "concat" is zero, unless zip was concatenated to another file
        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
        #if endrec[_ECD_SIGNATURE] == stringEndArchive64:
        #    # If Zip64 extension structures are present, account for them
        #    concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
        if self.debug > 2:
            inferred = concat + offset_cd
            print "given, inferred, offset", offset_cd, inferred, concat
        # self.start_dir: Position of start of central directory
        self.start_dir = offset_cd + concat
        ECD = _http_get_partial_data(self.url, self.start_dir, self.start_dir+size_cd-1)
        data = ECD.read()
        ECD.close()
        fp = cStringIO.StringIO(data)
        total = 0
        while total < size_cd:
            centdir = fp.read(sizeCentralDir)
            if centdir[0:4] != stringCentralDir:
                raise BadZipfile, "Bad magic number for central directory"
            centdir = struct.unpack(structCentralDir, centdir)
            if self.debug > 2:
                print centdir
            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
            # Create ZipInfo instance to store file information
            x = ZipInfo(filename)
            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
            (x.create_version, x.create_system, x.extract_version, x.reserved,
             x.flag_bits, x.compress_type, t, d,
             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
                            t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
            x._decodeExtra()
            x.header_offset = x.header_offset + concat
            x.filename = x._decodeFilename()
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x
            # update total bytes read from central directory
            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                     + centdir[_CD_EXTRA_FIELD_LENGTH]
                     + centdir[_CD_COMMENT_LENGTH])
            if self.debug > 2:
                print "total", total

    def namelist(self):
        """Return a list of file names in the archive."""
        l = []
        for data in self.filelist:
            l.append(data.filename)
        return l

    def infolist(self):
        """Return a list of class ZipInfo instances for files in the
        archive."""
        return self.filelist

    def printdir(self):
        """Print a table of contents for the zip file."""
        print "%-46s %19s %12s" % ("File Name", "Modified ", "Size")
        for zinfo in self.filelist:
            date = "%d-%02d-%02d %02d:%02d:%02d" % zinfo.date_time[:6]
            print "%-46s %s %12d" % (zinfo.filename, date, zinfo.file_size)

    def getinfo(self, name):
        """Return the instance of ZipInfo given 'name'."""
        info = self.NameToInfo.get(name)
        if info is None:
            raise KeyError(
                'There is no item named %r in the archive' % name)
        return info

    def open(self, name, pwd=None):
        """Return file-like object for 'name'."""
        if not self.url:
            raise RuntimeError, \
                  "Attempt to read ZIP archive that was already closed"
        zinfo = self.getinfo(name)
        offset = zinfo.header_offset
        f = _http_get_partial_data(self.url, offset, offset+sizeFileHeader-1)
        fheader = f.read()
        f.close()
        fheader = struct.unpack(structFileHeader, fheader)
        offset += sizeFileHeader
        f = _http_get_partial_data(self.url, offset, offset+fheader[_FH_FILENAME_LENGTH]-1)
        fname = f.read()
        f.close()
        if fname != zinfo.orig_filename:
            raise BadZipfile, \
                  'File name in directory "%s" and header "%s" differ.' % (
                      zinfo.orig_filename, fname)
        is_encrypted = zinfo.flag_bits & 0x1
        if is_encrypted:
            raise RuntimeError, "File %s is encrypted, " \
                  "not supported." % name
        offset += fheader[_FH_FILENAME_LENGTH] + fheader[_FH_EXTRA_FIELD_LENGTH]
        f = _http_get_partial_data(self.url, offset, offset+fheader[_FH_COMPRESSED_SIZE]-1)
        data = f.read()
        return ZipExtFile(cStringIO.StringIO(data), 'r', zinfo)

if __name__ == "__main__":
    # Some tests
    link = "http://dfn.dl.sourceforge.net/project/filezilla/FileZilla_Client/3.5.1/FileZilla_3.5.1_win32.zip"
    hzfile = HTTPZipFile(link)
    hzfile.printdir()
    for fname in ('GPL.html', 'resources/blukis/48x48/filter.png', 'resources/finished.wav'):
        source_name = join('FileZilla-3.5.1', fname)
        dest_fname = join('/tmp', basename(fname))
        print "Extracting %s to %s" % (source_name, dest_fname)
        with hzfile.open(source_name) as f:
            data = f.read()
        new_file = open(dest_fname, 'w')
        new_file.write(data)
        new_file.close()
