AttributeError while reading a wav file - Python

I am new to Python and I am trying to build a program that can encode and decode dual-tone multi-frequency (DTMF) signals, as used to dial a telephone.
For now the encoding part is working well, but for some reason the decoding is not working, and I get the following exception:
Traceback (most recent call last):
  File "C:\Users\matant\workspace\dialer2\dialer.py", line 239, in <module>
    x = d.decoder()
  File "C:\Users\matant\workspace\dialer2\dialer.py", line 218, in decoder
    data = self.read_wav()
  File "C:\Users\matant\workspace\dialer2\dialer.py", line 201, in read_wav
    n = fin.getnframes()
AttributeError: 'file' object has no attribute 'getnframes'
As you can see, I write the frames into the file myself, so I don't understand why this happens.
This is my code:
'''
Created on Jan 10, 2016
@author: matant
'''
import json
from math import pi, sin
import wave
import logging
import struct
import os

ROW_FREQ = (697, 770, 852, 941)
COL_FREQ = (1209, 1336, 1477, 1633)
SAMPLE_RATE = 44100
SAMPLE_WIDTH = 2
NUMBER_OF_CHANNELS = 1
COMPRESSION_TYPE = "NONE"
COMPRESSION_NAME = "Uncompressed"
PI2 = 6.283185306
scale = 32767  # max positive value of a signed 16-bit sample
keys = ('1', '2', '3', 'A',
        '4', '5', '6', 'B',
        '7', '8', '9', 'C',
        '*', '0', '#', 'D')

FREQUENCY_MAP = dict()
FREQUENCY_MAP['1'] = (697, 1209)
FREQUENCY_MAP['2'] = (697, 1336)
FREQUENCY_MAP['3'] = (697, 1477)
FREQUENCY_MAP['A'] = (697, 1633)
FREQUENCY_MAP['4'] = (770, 1209)
FREQUENCY_MAP['5'] = (770, 1336)
FREQUENCY_MAP['6'] = (770, 1477)
FREQUENCY_MAP['B'] = (770, 1633)
FREQUENCY_MAP['7'] = (852, 1209)
FREQUENCY_MAP['8'] = (852, 1336)
FREQUENCY_MAP['9'] = (852, 1477)
FREQUENCY_MAP['C'] = (852, 1633)
FREQUENCY_MAP['*'] = (941, 1209)
FREQUENCY_MAP['0'] = (941, 1336)
FREQUENCY_MAP['#'] = (941, 1477)
FREQUENCY_MAP['D'] = (941, 1633)
FREQUENCY_MAP['S'] = (0, 0)

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s <%(levelname)s> %(module)s.%(funcName)s() %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
log = logging.getLogger(__name__)
class DTMF:
    VALID_SEQUENCE_TYPES = [list, tuple, set]

    def __init__(self, input_string=None, input_list=None):
        """
        Initializes a DTMF instance with an optional DTMF sequence. This can be a list of lists or a JSON string.
        If both are supplied, it tries to parse the JSON string and, if that succeeds, uses it. If there are errors,
        it validates the list and tries to use that. Basically input_string takes precedence.
        General workflow would be setting dtmf_sequence and calling generate_raw_data. This data can then be saved to a
        .wav file or compressed and saved as other, smaller, file formats.
        :param input_list: list of lists or tuples of the form [['A', 100], ['S', 50], ['2', 100], ['S', 50]]
        :param input_string: json_string of the form '[["A", 100], ["S", 50], ["2", 100], ["S", 50]]'
        """
        log.debug("Creating instance of DTMF")
        log.debug("input_string = {}".format(input_string))
        log.debug("input_list = {}".format(input_list))
        self._dtmf_sequence = None
        self._raw_data = None
        if input_string is not None:
            converted_json_sequence = self.parse_json_string(input_string)
            self._dtmf_sequence = converted_json_sequence
        elif input_list is not None:
            self._dtmf_sequence = input_list

    @property
    def dtmf_sequence(self):
        return self._dtmf_sequence

    @dtmf_sequence.setter
    def dtmf_sequence(self, input_sequence):
        if type(input_sequence) == str:
            input_sequence = self.parse_json_string(input_sequence)
        if type(input_sequence) == list:
            if self._dtmf_sequence_is_valid(input_sequence):
                self._dtmf_sequence = input_sequence
                log.debug("Set _dtmf_sequence to {}".format(self._dtmf_sequence))

    def parse_json_string(self, input_string):
        return json.loads(input_string)
    def generate_raw_data(self):
        """
        Generates raw data that can be saved into a .wav file. This can take some time to generate.
        :raise AttributeError: If no dtmf sequence has been set
        """
        _data = list()
        if self._dtmf_sequence is None:
            raise AttributeError("No dtmf sequence set")
        for tone_tuple in self._dtmf_sequence:
            key = tone_tuple[0]
            tone_duration = tone_tuple[1]
            f1 = FREQUENCY_MAP[key][0]
            f2 = FREQUENCY_MAP[key][1]
            _data += (self.generate_tone(f1, f2, tone_duration))
        self._raw_data = _data

    def save_wave_file(self, file_path):
        if self._raw_data is None or len(self._raw_data) < 1:
            self.generate_raw_data()
        f = wave.open(file_path, 'w')
        f.setnchannels(NUMBER_OF_CHANNELS)
        f.setsampwidth(SAMPLE_WIDTH)
        f.setframerate(SAMPLE_RATE)
        f.setnframes(len(self._raw_data))
        f.setcomptype(COMPRESSION_TYPE, COMPRESSION_NAME)
        log.info("Saving wav file {} THIS MAY TAKE A WHILE".format(file_path))
        for i in self._raw_data:
            f.writeframes(struct.pack('i', i))
        log.info("Saved file to {0}".format(file_path))
        f.close()
    @staticmethod
    def dtmf_sequence_is_valid(input_list):
        """
        Validates an input sequence for proper structure and contents.
        :param input_list:
        :return:
        """
        if type(input_list) is not list:
            log.warning('input_list must be a list instance')
            return False
        if [(type(item) in DTMF.VALID_SEQUENCE_TYPES) for item in input_list].count(False) != 0:
            log.warning('input_list contains invalid sequence type')
            return False
        for item in input_list:
            if type(item[0]) != str or type(item[1]) != int:
                log.debug("Type list[0]: {}".format(type(item[0])))
                log.debug("Type list[1]: {}".format(type(item[1])))
                log.warning('input_list must contain a list of sequences of [str, int]')
                return False
        return True

    @staticmethod
    def generate_tone(f1, f2, _duration_in_ms):
        """
        Generates the samples of two combined frequencies.
        :param f1:
        :param f2:
        :param _duration_in_ms:
        :return:
        """
        assert f1 in ROW_FREQ or f1 == 0
        assert f2 in COL_FREQ or f2 == 0
        number_of_samples = int(SAMPLE_RATE * _duration_in_ms / 1000)
        scale = 32767  # signed int / 2
        result = list()
        for i in range(number_of_samples):
            p = i * 1.0 / SAMPLE_RATE
            result.append(int((sin(p * f1 * pi * 2) + sin(p * f2 * pi * 2)) / 2 * scale))
        log.info("Generated {0}ms tone of {1} samples with F1: {2} F2: {3}".format(
            _duration_in_ms, number_of_samples, f1, f2))
        return result
    def create_dtmf_wave_file(self, input_sequence, file_path, dump_to_csv=False):
        """
        A convenience method. Validates and assigns a dtmf_sequence, then generates data and saves to a .wav
        :param input_sequence: list of lists or tuples of the form [['A', 100], ['S', 50], ['2', 100], ['S', 50]] or json_string of the form '[["A", 100], ["S", 50], ["2", 100], ["S", 50]]'
        :param file_path: the full path of the wav file that will be saved
        """
        self._dtmf_sequence = input_sequence
        self.generate_raw_data()
        try:
            os.remove('dtmf_dump.csv')
        except:
            pass  # file doesn't exist
        if dump_to_csv:
            with open('dtmf_dump.csv', 'w') as f:
                for d in self._raw_data:
                    f.write(str(d))
                    f.write(",")
        self.save_wave_file(file_path)

    def read_wav(self):
        fin = open('testNum.wav', 'r')
        n = fin.getnframes()
        d = fin.readframes(n)
        fin.close()
        data = []
        for i in range(n):
            # LS8bit = inv_endian(ord(d[2*i]))
            # MS8bit = inv_endian(ord(d[2*i+1]))
            LS8bit, MS8bit = ord(d[2*i]), ord(d[2*i+1])
            data.append((MS8bit << 8) + LS8bit)
        return data
    # Decoder takes a DTMF signal file (.wav), sampled at 44,100
    # 16-bit samples per second, and decodes the corresponding symbol X.
    def decoder(self):
        data = self.read_wav()
        temp = []
        for f1 in ROW_FREQ:
            for f2 in COL_FREQ:
                diff = 0
                for i in range(SAMPLE_RATE):  # assume phase has not shifted dramatically
                    p = i * 1.0 / SAMPLE_RATE
                    S = int(scale + scale * (sin(p * f1 * PI2) + sin(p * f2 * PI2)) / 2)
                    diff += abs(S - data[i])
                temp.append((diff, f1, f2))
        f1, f2 = min(temp)[1:]  # retrieve the frequency pair with minimum signal distortion
        i, j = ROW_FREQ.index(f1), COL_FREQ.index(f2)
        X = keys[4*i + j]
        print 'Decoded key is: ', X
        return X


if __name__ == '__main__':
    d = 100
    sample_input = [('0', d), ('5', d), ('0', d), ('8', d), ('6', d), ('9', d), ('0', d), ('1', d), ('8', d), ('6', d)]
    d = DTMF()
    d.create_dtmf_wave_file(sample_input, file_path='testNum.wav', dump_to_csv=True)
    x = d.decoder()

fin = open('testNum.wav','r')
Looks like you're using the built-in open function instead of the one from the wave module. Try:
fin = wave.open('testNum.wav','r')

You have to work with the Wave_read object returned by wave.open; that object has the getnframes attribute your code is trying to access.
You could also add a from wave import open statement, which would shadow the built-in open, but it is clearer to reach the wave module's open through dot notation as wave.open.
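To put the fix together, here is a minimal sketch of read_wav built around wave.open. It assumes, as in the question's save_wave_file, a 16-bit mono file; struct.unpack replaces the manual byte arithmetic but produces the same values as the original (MS8bit << 8) + LS8bit reconstruction:

import struct
import wave

def read_wav(path='testNum.wav'):
    fin = wave.open(path, 'rb')  # Wave_read object: has getnframes()/readframes()
    try:
        n = fin.getnframes()
        raw = fin.readframes(n)  # n frames of 2 bytes each, as a byte string
    finally:
        fin.close()
    # '<%dH' unpacks n unsigned little-endian 16-bit samples
    return list(struct.unpack('<%dH' % n, raw))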

Related

I am trying to decode a Huffman tree but I am having some problems

I have created and encoded a Huffman tree and put it in a binary file, but now when I try to decode it I am getting this error:
  line 28, in <module>
    Huffman_Decoding(data, tree)
  File "/Users/georgesmac/Desktop/python_course/decompress.py", line 12, in Huffman_Decoding
    for x in encoded_data:
TypeError: 'node' object is not iterable
The decoding code:
import pickle

with open("binary.bin", "rb") as file_handle:
    data = pickle.load(file_handle)
    file_handle.seek(0)
    tree = pickle.load(file_handle)

def Huffman_Decoding(encoded_data, huffman_tree):
    tree_head = huffman_tree
    decoded_output = []
    for x in encoded_data:
        if x == '1':
            huffman_tree = huffman_tree.right
        elif x == '0':
            huffman_tree = huffman_tree.left
        try:
            if huffman_tree.left.symbol == None and huffman_tree.right.symbol == None:
                pass
        except AttributeError:
            decoded_output.append(huffman_tree.symbol)
            huffman_tree = tree_head
    string = ''.join([str(item) for item in decoded_output])
    return string

Huffman_Decoding(data, tree)
The encoding code:
import pickle

q = {}
a_file = open("george.txt", 'r')
for line in a_file:
    key, value = line.split()
    q[key] = value

class node:
    def __init__(self, freq, symbol, left=None, right=None):
        self.freq = freq
        self.symbol = symbol
        self.left = left
        self.right = right
        self.huff = ''

def printNodes(node, val=''):
    newVal = val + str(node.huff)
    if(node.left):
        printNodes(node.left, newVal)
    if(node.right):
        printNodes(node.right, newVal)
    if(not node.left and not node.right):
        print(f"{node.symbol} -> {newVal}")

chars = ['a', 'b', 'c', 'd', 'e', 'f']
# frequency of characters
freq = [q['a'], q['b'], q['c'], q['d'], q['e'], q['f']]
nodes = []
for x in range(len(chars)):
    nodes.append(node(freq[x], chars[x]))
while len(nodes) > 1:
    nodes = sorted(nodes, key=lambda x: x.freq)
    left = nodes[0]
    right = nodes[1]
    left.huff = 0
    right.huff = 1
    newNode = node(left.freq+right.freq, left.symbol+right.symbol, left, right)
    nodes.remove(left)
    nodes.remove(right)
    nodes.append(newNode)
printNodes(nodes[0])

with open('binary.bin', 'wb') as f:
    b = pickle.dumps(nodes[0])  # bytes representation of your object
    f.write(b)
You pickled objects of type node. To unpickle them, Python needs the node class. Since you pickled them with class node defined in the __main__ module, that is where Python looks for it.
You should either import the node class into decompress.py or define it there.
A cleaner solution is to create a module specifically for the node class and, in both the pickling and the unpickling code, import it from there.
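A minimal sketch of that layout (the module name huffman_node is made up for illustration; note that pickle records the defining module of a class at dump time, so the data needs to be re-pickled after the class moves):

# huffman_node.py -- shared module so pickle can resolve the class by import path
class node:
    def __init__(self, freq, symbol, left=None, right=None):
        self.freq = freq
        self.symbol = symbol
        self.left = left
        self.right = right
        self.huff = ''

Then both scripts share it:

# at the top of both the encoding script and decompress.py
from huffman_node import node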

python random data generator expected str instance, numpy.datetime64 found

Hello, I have been trying to write random data with random dates into a CSV file, but I am getting the following error: expected str instance, numpy.datetime64 found.
Code for the data generator:
import pandas as pd
import numpy as np
import string
import random

def gen_random_email():
    domains = ["hotmail.com", "gmail.com", "aol.com", "mail.com", "mail.kz", "yahoo.com"]
    letters = string.ascii_letters + '.'*5
    email = ''.join(np.random.choice(list(letters), 10)) + '@' + np.random.choice(domains)
    email = email.replace('.@', '@')
    return email, "Email"

def gen_random_float():
    num = np.random.random()*np.random.randint(2000)
    decimal_points = np.random.randint(8)
    num = int(num*10**(decimal_points))/10**decimal_points
    return str(num), 'Float'

def gen_random_sentence():
    nouns = ["puppy", "car", "rabbit", "girl", "monkey"]
    verbs = ["runs", "hits", "jumps", "drives", "barfs"]
    adv = ["crazily", "dutifully", "foolishly", "merrily", "occasionally"]
    adj = ["adorable.", "clueless.", "dirty.", "odd.", "stupid."]
    random_entry = lambda x: x[random.randrange(len(x))]
    random_entry = " ".join([random_entry(nouns), random_entry(verbs),
                             random_entry(adv), random_entry(adj)])
    return random_entry, 'String'

def gen_random_int():
    num = np.random.randint(1000000)
    return str(num), 'Int'

def gen_random_date():
    monthly_days = np.arange(0, 30)
    base_date = np.datetime64('2020-01-01')
    random_date = base_date + np.random.choice(monthly_days)
    return random_date, 'Date'

def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence, gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            file.write(",".join(random.choice(randomizers)()) + "\n")

gen_dataset('dataaaa.csv')
TypeError: sequence item 0: expected str instance, numpy.datetime64 found
First, catch the error and see what is causing it.
def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence, gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            f = random.choice(randomizers)
            result = f()
            try:
                file.write(",".join(result) + "\n")
            except TypeError:
                print(result)
                raise
>>>
(numpy.datetime64('2020-01-09'), 'Date')
Traceback (most recent call last):
  File "C:\pyProjects\tmp.py", line 80, in <module>
    gen_dataset('dataaaa.csv')
  File "C:\pyProjects\tmp.py", line 75, in gen_dataset
    file.write(",".join(result)+"\n")
TypeError: sequence item 0: expected str instance, numpy.datetime64 found
Hmmm, I wonder if join only accepts strings as arguments?
Yep, from the docs:
A TypeError will be raised if there are any non-string values in iterable, including bytes objects.
I wonder how I can turn a numpy datetime64 to a string. Searching with numpy datetime64 to string is productive: Convert numpy.datetime64 to string object in python
These work
>>> q = gen_random_date()[0]
>>> q
numpy.datetime64('2020-01-27')
>>> np.datetime_as_string(q)
'2020-01-27'
>>> q.astype(str)
'2020-01-27'
>>>
Then just modify the try/except.
def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence, gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            f = random.choice(randomizers)
            a, b = f()
            try:
                q = ",".join([a, b, "\n"])
            except TypeError:
                a = np.datetime_as_string(a)
                q = ",".join([a, b, "\n"])
            file.write(q)
Or simply preemptively make the first item a string.
def gen_dataset(filename, size=5000):
    randomizers = [gen_random_email, gen_random_float, gen_random_int, gen_random_sentence, gen_random_date]
    with open(filename, 'w') as file:
        file.write("Text, Type\n")
        for _ in range(size):
            f = random.choice(randomizers)
            a, b = f()
            q = ",".join([str(a), b, "\n"])
            file.write(q)
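Equivalently, as a small variation on the same idea (not part of the original answer), gen_random_date could return a string up front, so that every randomizer has the same contract:

import numpy as np

def gen_random_date():
    monthly_days = np.arange(0, 30)
    base_date = np.datetime64('2020-01-01')
    random_date = base_date + np.random.choice(monthly_days)
    # np.datetime_as_string converts datetime64 to e.g. '2020-01-09'
    return np.datetime_as_string(random_date), 'Date'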

Too many values to unpack python gbf file

Hello, I get the Python error "too many values to unpack" when I run my script.
Traceback:
  File "C:\Python27.1\perpetuum.py", line 193, in __init__
    header_offset, header_length = struct.unpack('8sii', f.read(16))
ValueError: too many values to unpack.
Code:
import array
import fnmatch
import os
import struct
from collections import OrderedDict

class DataFile(object):
    """GBF file reader"""
    def __init__(self, filename):
        self.filename = filename
        with open(filename, 'rb') as f:
            header_offset, header_length = struct.unpack('8sii', f.read(16))
            if magic != 'GXYDATA\x1a':
                raise Exception('Invalid data file (wrong magic)', magic)
            header = self._get_record(header_offset, header_length)
            self._records = self._parse_header(header)

    def _decode(self, data):
        try:
            import numpy as np
            i = np.arange(len(data), dtype=np.byte)
            buf = np.frombuffer(data, np.byte) ^ ((i + 1) * (i ^ -54) - 84)
            return buf.tostring()
        except ImportError:
            buf = array.array('B', data)
            for i in xrange(len(data)):
                buf[i] = 0xff & (buf[i] ^ ((i + 1) * (i ^ 0xca) - 84))
            return buf.tostring()

    def _get_record(self, offset, length):
        with open(self.filename, 'rb') as f:
            f.seek(offset)
            data = f.read(length)
        return self._decode(data)

    def _parse_header(self, header):
        """
        header record format:
        int numRecords
        for each record:
            char[nameLen] nameSZ, 0
            int offset
            int length
            int unkLength
        """
        records = {}
        num_records = struct.unpack_from('i', header)[0]
        pos = 4
        for i in xrange(num_records):
            name_end = header.find('\0', pos)
            name = header[pos:name_end]
            pos = name_end + 1
            offset, length, unkLength = struct.unpack_from('iii', header, pos)
            pos += 12
            # f1Length = min(13, unkLength)
            # f1 = header[pos:pos+f1Length]
            pos += unkLength
            records[name] = (offset, length)
        return records

    PREFIX_MAP = {'\x89PNG': '.png',
                  'DDS ': '.dds',
                  'A3MF': '.a3m',
                  '#': '.txt',
                  '=': '.txt',
                  'Extended Module': '.xm',
                  'RIFF': '.wav',
                  'OggS': '.ogg'}

    def _guess_ext(self, name, data):
        for prefix, ext in self.PREFIX_MAP.iteritems():
            if data.startswith(prefix):
                return ext
        return '.bin'

    CATEGORY_MAP = OrderedDict([
        ('def*.png', 'icons'),
        ('icon*.png', 'icons'),
        ('entityIcon*.png', 'icons'),
        ('noIcon*.png', 'icons'),
        ('gfx_*.png', 'gfx'),
        ('*.a3m', 'models'),
        ('snd_*', 'sound'),
        ('altitude*', 'terrain'),
        ('terrain*', 'terrain'),
        ('altitude0*', 'terrain'),
        ('blocks0*', 'terrain'),
        ('control0*', 'terrain'),
        ('plants0*', 'terrain'),
        ('surface0*', 'terrain'),
        ('tactical*.png', 'tactical_icons'),
        ('font*', 'font'),
        ('textures_*.dds', 'textures'),
        ('corp*.png', 'corp_icons'),
        ('credits.txt', 'misc'),
        ('eula*.txt', 'misc'),
        ('*.txt', 'text_data')])

    def dump_record(self, name, dest_dir, sort=False):
        offset, length = self._records[name]
        print '%08x: %s (%.2f KB)' % (offset, name, length / 1024.)
        data = self._get_record(offset, length)
        name += self._guess_ext(name, data)
        if sort:
            for pattern, category in self.CATEGORY_MAP.iteritems():
                if fnmatch.fnmatch(name, pattern):
                    dest_dir = os.path.join(dest_dir, category)
                    try:
                        os.makedirs(dest_dir)
                    except OSError:
                        pass
                    break
        rec_filename = os.path.join(dest_dir, name)
        with open(rec_filename, 'wb') as f:
            f.write(data)

    def dump_records(self, patterns, dest_dir, sort=False):
        for name in self._records:
            if any(fnmatch.fnmatch(name, pattern) for pattern in patterns):
                self.dump_record(name, dest_dir, sort)
Any ideas?
The code
struct.unpack('8sii', f.read(16))
unpacks into three values: an 8-byte string and two integers of 4 bytes each, while your left-hand side has only two variables.
See the struct documentation for its format strings.
This line is making an incorrect assumption:
header_offset, header_length = struct.unpack('8sii', f.read(16))
You are probably better off checking the size of the tuple returned from struct.unpack and then dealing with the results conditionally.
Note that according to the docs, this method always returns a tuple, even if that tuple is of length one.
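Concretely, the three-value '8sii' format needs three names on the left-hand side. A sketch of the presumably intended line, matching the magic check that follows it in the question (and the asker's own fix below):

magic, header_offset, header_length = struct.unpack('8sii', f.read(16))
if magic != 'GXYDATA\x1a':
    raise Exception('Invalid data file (wrong magic)', magic)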
Stupid me, I had removed magic, from before header_offset.

While conversion, invalid literal for float

I'm trying to train on my own data with the YOLO network, but before that I have to convert the bounding-box coordinates to the form it wants.
The file contents are like this:
0
53 19 163 116
and I'm trying to convert it to the form the network works with.
The code is:
import os
from os import walk, getcwd
from PIL import Image

classes = ["stopsign"]

def convert(size, box):
    dw = 1./size[0]
    dh = 1./size[1]
    x = (box[0] + box[1])/2.0
    y = (box[2] + box[3])/2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    x = x*dw
    w = w*dw
    y = y*dh
    h = h*dh
    return (x, y, w, h)

"""-------------------------------------------------------------------
"""
""" Configure Paths"""
mypath = "/home/decentmakeover2/BBox-Label-Tool/Labels/002/"
outpath = "/home/decentmakeover2/output/"
cls = "stopsign"
if cls not in classes:
    exit(0)
cls_id = classes.index(cls)
wd = getcwd()
list_file = open('%s/%s_list.txt' % (wd, cls), 'w')

""" Get input text file list """
txt_name_list = []
for (dirpath, dirnames, filenames) in walk(mypath):
    txt_name_list.extend(filenames)
    break
print(txt_name_list)

""" Process """
for txt_name in txt_name_list:
    # txt_file = open("Labels/stop_sign/001.txt", "r")

    """ Open input text files """
    txt_path = mypath + txt_name
    print("Input:" + txt_path)
    txt_file = open(txt_path, "r")
    lines = txt_file.read().split('\r\n')  # for ubuntu, use "\r\n" instead of "\n"

    """ Open output text files """
    txt_outpath = outpath + txt_name
    print("Output:" + txt_outpath)
    txt_outfile = open(txt_outpath, "w")

    """ Convert the data to YOLO format """
    ct = 0
    for line in lines:
        # print('lenth of line is: ')
        # print(len(line))
        # print('\n')
        if(len(line) >= 2):
            ct = ct + 1
            print(line + "\n")
            elems = line.split(' ')
            print(elems)
            xmin = elems[0]
            xmax = elems[2]
            ymin = elems[1]
            ymax = elems[3]
            #
            img_path = str('%s/images/%s/%s.JPEG' % (wd, cls, os.path.splitext(txt_name)[0]))
            # t = magic.from_file(img_path)
            # wh = re.search('(\d+) x (\d+)', t).groups()
            im = Image.open(img_path)
            w = int(im.size[0])
            h = int(im.size[1])
            # w = int(xmax) - int(xmin)
            # h = int(ymax) - int(ymin)
            # print(xmin)
            print(w, h)
            b = (float(xmin), float(xmax), float(ymin), float(ymax))
            bb = convert((w, h), b)
            print(bb)
            txt_outfile.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')

    """ Save those images with bb into list"""
    if(ct != 0):
        list_file.write('%s/images/%s/%s.JPEG\n' % (wd, cls, os.path.splitext(txt_name)[0]))

list_file.close()
and I get the error. First it prints out all the file names and the content of the data, then:
['0\n53', '19', '163', '116\n']
(262, 192)
Traceback (most recent call last):
  File "text.py", line 84, in <module>
    b = (float(xmin), float(xmax), float(ymin), float(ymax))
ValueError: invalid literal for float(): 0
53
I'm not really sure what to do here.
Any suggestions?
As seen in the error message, your first term is '0\n53', whereas it should be '0' followed by '53', so it isn't parsed as a float. The file is separated by plain '\n', so just splitting on '\n' should work.
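In other words, a one-line sketch of the fix (the rest of the loop can stay unchanged; the existing len(line) >= 2 check then skips the lone class-id line such as '0'):

lines = txt_file.read().split('\n')  # the label files use plain '\n' line endings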

Windows/Python Error WindowsError: [Error 3] The system cannot find the path specified

Hi, I am new to Python and I need some help. I am trying to run a file on Windows 10 with Python 2.7.
import os
import re
import codecs
import numpy as np
import theano

models_path = "./models"
eval_path = "./evaluation"
eval_temp = os.path.join(eval_path, "temp")
eval_script = os.path.join(eval_path, "conlleval")

def get_name(parameters):
    """
    Generate a model name from its parameters.
    """
    l = []
    for k, v in parameters.items():
        if type(v) is str and "/" in v:
            l.append((k, v[::-1][:v[::-1].index('/')][::-1]))
        else:
            l.append((k, v))
    name = ",".join(["%s=%s" % (k, str(v).replace(',', '')) for k, v in l])
    return "".join(i for i in name if i not in "\/:*?<>|")

def set_values(name, param, pretrained):
    """
    Initialize a network parameter with pretrained values.
    We check that sizes are compatible.
    """
    param_value = param.get_value()
    if pretrained.size != param_value.size:
        raise Exception(
            "Size mismatch for parameter %s. Expected %i, found %i."
            % (name, param_value.size, pretrained.size)
        )
    param.set_value(np.reshape(
        pretrained, param_value.shape
    ).astype(np.float32))

def shared(shape, name):
    """
    Create a shared object of a numpy array.
    """
    if len(shape) == 1:
        value = np.zeros(shape)  # bias are initialized with zeros
    else:
        drange = np.sqrt(6. / (np.sum(shape)))
        value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
    return theano.shared(value=value.astype(theano.config.floatX), name=name)

def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico

def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item

def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub('\d', '0', s)

def iob2(tags):
    """
    Check that tags have a valid IOB format.
    Tags in IOB1 format are converted to IOB2.
    """
    for i, tag in enumerate(tags):
        if tag == 'O':
            continue
        split = tag.split('-')
        if len(split) != 2 or split[0] not in ['I', 'B']:
            return False
        if split[0] == 'B':
            continue
        elif i == 0 or tags[i - 1] == 'O':  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
        elif tags[i - 1][1:] == tag[1:]:
            continue
        else:  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
    return True

def iob_iobes(tags):
    """
    IOB -> IOBES
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if i + 1 != len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if i + 1 < len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags

def iobes_iob(tags):
    """
    IOBES -> IOB
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag.split('-')[0] == 'B':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'I':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'S':
            new_tags.append(tag.replace('S-', 'B-'))
        elif tag.split('-')[0] == 'E':
            new_tags.append(tag.replace('E-', 'I-'))
        elif tag.split('-')[0] == 'O':
            new_tags.append(tag)
        else:
            raise Exception('Invalid format!')
    return new_tags

def insert_singletons(words, singletons, p=0.5):
    """
    Replace singletons by the unknown word with a probability p.
    """
    new_words = []
    for word in words:
        if word in singletons and np.random.uniform() < p:
            new_words.append(0)
        else:
            new_words.append(word)
    return new_words

def pad_word_chars(words):
    """
    Pad the characters of the words in a sentence.
    Input:
        - list of lists of ints (list of words, a word being a list of char indexes)
    Output:
        - padded list of lists of ints
        - padded list of lists of ints (where chars are reversed)
        - list of ints corresponding to the index of the last character of each word
    """
    max_length = max([len(word) for word in words])
    char_for = []
    char_rev = []
    char_pos = []
    for word in words:
        padding = [0] * (max_length - len(word))
        char_for.append(word + padding)
        char_rev.append(word[::-1] + padding)
        char_pos.append(len(word) - 1)
    return char_for, char_rev, char_pos

def create_input(data, parameters, add_label, singletons=None):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    words = data['words']
    chars = data['chars']
    if singletons is not None:
        words = insert_singletons(words, singletons)
    if parameters['cap_dim']:
        caps = data['caps']
    char_for, char_rev, char_pos = pad_word_chars(chars)
    input = []
    if parameters['word_dim']:
        input.append(words)
    if parameters['char_dim']:
        input.append(char_for)
        if parameters['char_bidirect']:
            input.append(char_rev)
        input.append(char_pos)
    if parameters['cap_dim']:
        input.append(caps)
    if add_label:
        input.append(data['tags'])
    return input

def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, dictionary_tags, eval_id):
    """
    Evaluate current model using CoNLL script.
    """
    n_tags = len(id_to_tag)
    predictions = []
    count = np.zeros((n_tags, n_tags), dtype=np.int32)
    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        input = create_input(data, parameters, False)
        if parameters['crf']:
            y_preds = np.array(f_eval(*input))[1:-1]
        else:
            y_preds = f_eval(*input).argmax(axis=1)
        y_reals = np.array(data['tags']).astype(np.int32)
        assert len(y_preds) == len(y_reals)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
        r_tags = [id_to_tag[y_real] for y_real in y_reals]
        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)
            r_tags = iobes_iob(r_tags)
        for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
            new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
            predictions.append(new_line)
            count[y_real, y_pred] += 1
        predictions.append("")

    # Write predictions to disk and run CoNLL script externally
    # eval_id = np.random.randint(1000000, 2000000)
    output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
    scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
    os.system("%s < %s > %s" % (eval_script, output_path, scores_path))

    # CoNLL evaluation results
    eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
    # trainLog = open('train.log', 'w')
    for line in eval_lines:
        print line
        # trainLog.write("%s\n" % line)

    # Remove temp files
    # os.remove(output_path)
    # os.remove(scores_path)

    # Confusion matrix with accuracy for each tag
    print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
        "ID", "NE", "Total",
        *([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
    )
    for i in xrange(n_tags):
        print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
            str(i), id_to_tag[i], str(count[i].sum()),
            *([count[i][j] for j in xrange(n_tags)] +
              ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
        )

    # Global accuracy
    print "%i/%i (%.5f%%)" % (
        count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
    )

    # F1 on all entities
    return float(eval_lines[1].strip().split()[-1])
When I run the code as it is, I always get the error below. I think it is either because of the restriction on path length in Windows, or because it needs different slashes. I don't know what to add or subtract to resolve the problem.
run train.py --train lstm/fold1/train --dev lstm/fold1/dev --test lstm/fold1/test
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GT 620M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)
Traceback (most recent call last):
  File "E:\New-Code\tagger-master\tagger-master\train.py", line 135, in <module>
    model = Model(parameters=parameters, models_path=models_path)
  File "model.py", line 36, in __init__
    os.makedirs(self.model_path)
  File "C:\Users\Acer\Anaconda2\envs\env_name27\lib\os.py", line 157, in makedirs
    mkdir(name, mode)
WindowsError: [Error 3] The system cannot find the path specified: './models\tag_scheme=iob,lower=False,zeros=False,char_dim=25,char_lstm_dim=25,char_bidirect=True,word_dim=100,word_lstm_dim=100,word_bidirect=True,pre_emb=,all_emb=False,cap_dim=0,crf=True,dropout=0.3,lr_method=sgd-lr_.005'
On Windows, paths are written with the backslash \ instead of the forward slash / used on Linux/Unix.
Try it like below if the folder is one level back:
models_path = "..\models"
eval_path = "..\evaluation"
