TreeView to JSON in Python

[Edit: apparently this file looks similar to the HDF5 format]
I am trying to extract metadata from a file with the .dm3 extension using HyperSpy in Python. I am able to get all the data, but it is saved as a tree view, and I need it as JSON. I tried writing my own parser to convert it, which worked for most cases but then failed:
TreeView data generated
Is there a library or package I can use to convert the tree view to JSON in Python?
My parser:
import os
import hyperspy.api as hs  # imports implied by the usage below

def writearray(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '[')
    for char in k[1]:
        file.write(char)
    file.write(']')

def writenum(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + k[1])

def writestr(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '"' + k[1] + '"')

def startnew(file, string):
    file.write('"' + string + '":' + '{\n')

def closenum(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + k[1] + '\n')
    file.write('},\n')

def closestr(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '"' + k[1] + '"' + '\n')
    file.write('},\n')

def closearr(file, string):
    k = string.split('=')
    file.write('"' + k[0] + '":' + '[')
    for char in k[1]:
        file.write(char)
    file.write(']\n')
    file.write('},\n')

def strfix(string):
    temp = ''
    for char in string:
        if char != ' ':
            temp += char
    return temp

def writethis(file, string):
    stripped = strfix(string)
    if "=" in stripped:
        temp = stripped.split("=")
        if ',' in temp[1]:
            writearray(file, stripped)
        elif temp[1].isdigit() or temp[1].isdecimal():
            writenum(file, stripped)
        else:
            writestr(file, stripped)

def createMetaData(dm3file):
    txtfile = os.path.splitext(dm3file)[0] + '.txt'
    jsonfile = os.path.splitext(dm3file)[0] + '.json'
    s = hs.load(dm3file)
    s.original_metadata.export(txtfile)
    file1 = open(txtfile, 'r', encoding="utf-8")
    Lines = file1.readlines()
    k = []
    for line in Lines:
        k.append(line)
    L = []
    for string in k:
        temp = ''
        for char in string:
            if char.isalpha() or char.isdigit() or char == '=' or char == ' ' or char == '<' or char == '>' or char == ',' or char == '.' or char == '-' or char == ':':
                temp += char
        L.append(temp)
    file2 = open(jsonfile, 'w', encoding="utf-8")
    file2.write('{\n')
    for i in range(0, len(L) - 1):
        currentspaces = len(L[i]) - len(L[i].lstrip())
        nextspaces = len(L[i + 1]) - len(L[i + 1].lstrip())
        sub = nextspaces - currentspaces
        if i != len(L) - 2:
            if sub == 0:
                writethis(file2, L[i])
                if '=' in L[i]:
                    file2.write(',\n')
                else:
                    file2.write('\n')
            elif sub > 0:
                startnew(file2, L[i])
            else:
                if sub == -3:
                    writethis(file2, L[i])
                    file2.write('\n},\n')
                elif sub == -7:
                    writethis(file2, L[i])
                    file2.write('\n}\n},\n')
        else:
            writethis(file2, L[i])
            file2.write('\n}\n}\n}\n}')
    file1.close()  # note: file2 is never closed, so buffered output may be lost
    os.remove(txtfile)

I wrote a parser for the tree-view format:
from ast import literal_eval
from collections import abc

from more_itertools import peekable


def parse_literal(x: str):
    try:
        return literal_eval(x)
    except Exception:
        return x.strip()


def _treeview_parse_list(lines: peekable) -> list:
    list_as_dict = {}
    for line in (x.strip() for x in lines):
        raw_k, raw_v = line.split(' = ')
        list_as_dict[int(raw_k.split()[-1][1:-1])] = parse_literal(raw_v)
        peek = lines.peek(None)
        if '╚' in line or (peek is not None and '├' in peek):
            break
    list_as_list = [None] * (max(list_as_dict) + 1)
    for idx, v in list_as_dict.items():
        list_as_list[idx] = v
    return list_as_list


def _treeview_parse_dict(lines: peekable) -> dict:
    node = {}
    for line in (x.strip() for x in lines):
        if ' = ' in line:
            raw_k, raw_v = line.split(' = ')
            node[raw_k.split()[-1]] = parse_literal(raw_v)
        elif '<list>' in line:
            node[line.split()[-2]] = _treeview_parse_list(lines)
        else:
            try:
                idx = line.index('├')
            except ValueError:
                idx = line.index('└')
            peek = lines.peek(None)
            if peek is not None and '├' in peek and idx == peek.index('├'):
                node[line.split()[-1]] = {}
            else:
                node[line.split()[-1]] = _treeview_parse_dict(lines)
        if '└' in line:
            break
    return node


def treeview_to_dict(lines: abc.Iterable) -> dict:
    return _treeview_parse_dict(peekable(lines))
Usage:
with open('meta.txt') as f:
    d = treeview_to_dict(f)
You can obtain the metadata as a JSON file using Python's built-in json library:
import json

with open('meta.txt') as txt_file:
    with open('meta.json', 'w') as json_file:
        json.dump(treeview_to_dict(txt_file), json_file, indent=4)
I've added indent=4 to make the JSON file more human-readable, so that you can verify it against the original format. As far as I can tell they match up in a sensible way.
As I've written this, it uses the third-party more_itertools.peekable class. If you can't use more_itertools, it shouldn't be too hard to implement that functionality yourself, or just refactor the code so that it is no longer necessary to look ahead.
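If you do need to drop the dependency, a minimal stand-in with just the behaviour used above might look like this (a sketch, not part of any library; only iteration and peek are implemented):
class Peekable:
    """Bare-bones replacement for more_itertools.peekable."""
    _SENTINEL = object()

    def __init__(self, iterable):
        self._it = iter(iterable)
        self._cache = self._SENTINEL

    def __iter__(self):
        return self

    def __next__(self):
        # Serve the cached item first if peek() already consumed one.
        if self._cache is not self._SENTINEL:
            value, self._cache = self._cache, self._SENTINEL
            return value
        return next(self._it)

    def peek(self, default=None):
        # Look at the next item without consuming it.
        if self._cache is self._SENTINEL:
            try:
                self._cache = next(self._it)
            except StopIteration:
                return default
        return self._cache
Swapping it in would only require replacing the peekable(...) call in treeview_to_dict (and the peekable type hints).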
License:
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means.
In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to https://unlicense.org

A more straightforward approach is to use the as_dictionary method to convert the metadata to a Python dictionary, which you can then convert to JSON.
import hyperspy.api as hs
s = hs.load('file.dm3')
metadata_dictionary = s.original_metadata.as_dictionary()
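From there, writing the JSON file is one call to the standard library (a minimal sketch; it assumes every value in the dictionary is JSON-serializable, and 'file.json' is a placeholder name):
import json

with open('file.json', 'w') as f:
    json.dump(metadata_dictionary, f, indent=4)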
A different approach is to use the new RosettaSciIO library, which has been split out from HyperSpy, to extract the metadata. For more information, see the documentation: https://hyperspy.org/rosettasciio/


Attribute Error: 'PdfFileReader' object has no attribute '_checkKids'

I am new to Python, and pretty new to programming too. Any advice would be very helpful.
I used a script to read a PDF file and extract a JavaScript file to use for form autofilling.
I have installed the PyPDF2 module, but I am getting this error:
An error occured... :( 'PdfFileReader' object has no attribute '_checkKids'
Here is the code I am using:
import os
import sys
from collections import OrderedDict
from PyPDF2 import PdfFileReader


def _getFields(obj, tree=None, retval=None, fileobj=None):
    fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name',
                       '/TU': 'Alternate Field Name', '/TM': 'Mapping Name',
                       '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
    if retval is None:
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        if "/AcroForm" in catalog:
            tree = catalog["/AcroForm"]
        else:
            return None
    if tree is None:
        return retval
    obj._checkKids(tree, retval, fileobj)
    for attr in fieldAttributes:
        if attr in tree:
            obj._buildField(tree, retval, fileobj, fieldAttributes)
            break
    if "/Fields" in tree:
        fields = tree["/Fields"]
        for f in fields:
            field = f.getObject()
            obj._buildField(field, retval, fileobj, fieldAttributes)
    return retval


def get_form_fields(infile):
    infile = PdfFileReader(open(infile, 'rb'))
    fields = _getFields(infile)
    return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())


def selectListOption(all_lines, k, v):
    all_lines.append('function setSelectedIndex(s, v) {')
    all_lines.append('for (var i = 0; i < s.options.length; i++) {')
    all_lines.append('if (s.options[i].text == v) {')
    all_lines.append('s.options[i].selected = true;')
    all_lines.append('return;')
    all_lines.append('}')
    all_lines.append('}')
    all_lines.append('}')
    all_lines.append('setSelectedIndex(document.getElementById("' + k + '"), "' + v + '");')


def readList(fname):
    lst = []
    with open(fname, 'r') as fh:
        for l in fh:
            lst.append(l.rstrip(os.linesep))
    return lst


def createBrowserScript(fl, fl_ext, items, pdf_file_name):
    if pdf_file_name and len(fl) > 0:
        of = os.path.splitext(pdf_file_name)[0] + '.txt'
        all_lines = []
        for k, v in items.items():
            print(k + ' -> ' + v)
            if v in ['/Yes', '/On']:
                all_lines.append("document.getElementById('" + k + "').checked = true;\n")
            elif v in ['/0'] and k in fl_ext:
                all_lines.append("document.getElementById('" + k + "').checked = true;\n")
            elif v in ['/No', '/Off', '']:
                all_lines.append("document.getElementById('" + k + "').checked = false;\n")
            elif v in [''] and k in fl_ext:
                all_lines.append("document.getElementById('" + k + "').checked = false;\n")
            elif k in fl:
                selectListOption(all_lines, k, v)
            else:
                all_lines.append("document.getElementById('" + k + "').value = '" + v + "';\n")
        outF = open(of, 'w')
        outF.writelines(all_lines)
        outF.close()


def execute(args):
    try:
        fl = readList('myview.ini')
        fl_ext = readList('myview_ext.ini')
        if len(args) == 2:
            pdf_file_name = args[1]
            items = get_form_fields(pdf_file_name)
            createBrowserScript(fl, fl_ext, items, pdf_file_name)
        else:
            files = [f for f in os.listdir('.') if os.path.isfile(f) and f.endswith('.pdf')]
            for f in files:
                items = get_form_fields(f)
                createBrowserScript(fl, fl_ext, items, f)
    except BaseException as msg:
        print('An error occured... :( ' + str(msg))


if __name__ == '__main__':
    from pprint import pprint
    execute(sys.argv)
The error you're seeing is being triggered because you call:
obj._checkKids(tree, retval, fileobj)
_checkKids has an underscore in front of it. This indicates that the function is meant to be used internally within the class and shouldn't be called externally.
In Python, this means:
"1. Single Leading Underscore:
When it comes to variable and method names, the single underscore prefix has a meaning by convention only. It’s a hint to the programmer—and it means what the Python community agrees it should mean, but it does not affect the behavior of your programs.
The underscore prefix is meant as a hint to another programmer that a variable or method starting with a single underscore is intended for internal use. This convention is defined in PEP 8.
This isn’t enforced by Python. Python does not have strong distinctions between “private” and “public” variables like Java does. It’s like someone put up a tiny underscore warning sign that says:
“Hey, this isn’t really meant to be a part of the public interface of this class. Best to leave it alone.”"
https://dbader.org/blog/meaning-of-underscores-in-python
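A minimal illustration of the convention (a made-up class, not from PyPDF2):
class Reader:
    def pages(self):
        # Public method: part of the interface callers should use.
        return self._parse()

    def _parse(self):
        # Leading underscore: an internal detail that may change or
        # disappear between library versions, so don't call it directly.
        return []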
My additional recommendation:
I noticed that some of your code is not indented inside the first function you've written. I would double-check all the indentation to make sure every line is included in the block you intend.

PLY using dynamic tokens

I'm writing a program that can parse math papers written as .tex files. Here is what I want:
The program is supposed to detect the beginning, the end, sections, subsections, subsubsections, theorems, lemmas, definitions, conjectures, corollaries, propositions, exercises, notations and examples in a math paper, and ignore the rest of the contents to produce a summary.
At the beginning the program is supposed to retain all characters until reaching the token MT. At that point the lexer should preserve the token and enter ig mode. Then it should ignore all characters unless it detects a theorem/lemma/definition/conjecture/corollary/example/exercise/notation/proposition, in which case it temporarily enters the INITIAL mode and retains it, or a (sub/subsub)section, in which case it should temporarily enter the sec mode.
\newtheorem{<name>}{<heading>}[<counter>] and \newtheorem{<name>}[<counter>]{<heading>} are detected as TH ptext THCC ptext THC ptext and TH ptext THCS ptext THSC ptext THC respectively, where ptext is a run of TEXT tokens.
import sys
import logging
from ply.lex import TOKEN

if sys.version_info[0] >= 3:
    raw_input = input

tokens = (
    'BT', 'BL', 'BD', 'BCONJ', 'BCOR', 'BE', 'ET', 'EL', 'ED', 'ECONJ', 'ECOR', 'EE', 'SEC', 'SSEC', 'SSSEC', 'ES', 'TEXT', 'ITEXT', 'BIBS', 'MT', 'BN', 'EN', 'BEXE', 'EEXE', 'BP', 'EP', 'TH', 'THCS', 'THSC', 'THCC', 'THC',
)

states = (('ig', 'exclusive'), ('sec', 'exclusive'), ('th', 'exclusive'), ('tht', 'exclusive'), ('thc', 'exclusive'))

logging.basicConfig(
    level=logging.DEBUG,
    filename="lexlog.txt",
    filemode="w",
    format="%(filename)10s:%(lineno)4d:%(message)s"
)
log = logging.getLogger()

th_temp = ''
thn_temp = ''
term_dic = {'Theorem': '', 'Lemma': '', 'Corollary': '', 'Definition': '', 'Conjecture': '', 'Example': '', 'Exercise': '', 'Notation': '', 'Proposition': ''}
idb_list = ['', '', '', '', '', '', '', '', '']
ide_list = ['', '', '', '', '', '', '', '', '']
bb = r'\\begin\{'
eb = r'\\end\{'
ie = r'\}'

def finalize_terms():
    global idb_list
    global ide_list
    if term_dic['Theorem'] != '':
        idb_list[0] = bb + term_dic['Theorem'] + ie
        ide_list[0] = eb + term_dic['Theorem'] + ie
    if term_dic['Lemma'] != '':
        idb_list[1] = bb + term_dic['Lemma'] + ie
        ide_list[1] = eb + term_dic['Lemma'] + ie
    if term_dic['Corollary'] != '':
        idb_list[2] = bb + term_dic['Corollary'] + ie
        ide_list[2] = eb + term_dic['Corollary'] + ie
    if term_dic['Definition'] != '':
        idb_list[3] = bb + term_dic['Definition'] + ie
        ide_list[3] = eb + term_dic['Definition'] + ie
    if term_dic['Conjecture'] != '':
        idb_list[4] = bb + term_dic['Conjecture'] + ie
        ide_list[4] = eb + term_dic['Conjecture'] + ie
    if term_dic['Example'] != '':
        idb_list[5] = bb + term_dic['Example'] + ie
        ide_list[5] = eb + term_dic['Example'] + ie
    if term_dic['Exercise'] != '':
        idb_list[6] = bb + term_dic['Exercise'] + ie
        ide_list[6] = eb + term_dic['Exercise'] + ie
    if term_dic['Notation'] != '':
        idb_list[7] = bb + term_dic['Notation'] + ie
        ide_list[7] = eb + term_dic['Notation'] + ie
    if term_dic['Proposition'] != '':
        idb_list[8] = bb + term_dic['Proposition'] + ie
        ide_list[8] = eb + term_dic['Proposition'] + ie
    print(idb_list)
    print(ide_list)
Here are some of the parsing functions:
def t_TH(t):
    r'\\newtheorem\{'
    t.lexer.begin('th')
    return t

def t_th_THCS(t):
    r'\}\['
    t.lexer.begin('thc')
    return t

def t_tht_THC(t):
    r'\}'
    global th_temp, thn_temp
    if thn_temp not in term_dic:  # dict.has_key() no longer exists in Python 3
        print(f"{thn_temp} is unknown!")
    elif len(th_temp) == 0:
        print(f"No abbreviation for {thn_temp} is found!")
    else:
        term_dic[thn_temp] = th_temp
        print(f"The abbreviation for {thn_temp} is {th_temp}!")
    th_temp = ''
    thn_temp = ''
    t.lexer.begin('INITIAL')
    return t

def t_th_THCC(t):
    r'\}\{'
    t.lexer.begin('tht')
    return t

def t_thc_THSC(t):
    r'\]\{'
    t.lexer.begin('tht')
    return t

@TOKEN(idb_list[0])
def t_ig_BT(t):
    t.lexer.begin('INITIAL')
    return t

@TOKEN(ide_list[0])
def t_ET(t):
    t.lexer.begin('ig')
    return t

def t_INITIAL_sec_thc_TEXT(t):
    r'[\s\S]'
    return t

def t_th_TEXT(t):
    r'[\s\S]'
    global th_temp
    th_temp = th_temp + t.value  # t.value is an attribute, not a method
    return t

def t_tht_TEXT(t):
    r'[\s\S]'
    global thn_temp
    thn_temp = thn_temp + t.value
    return t

def t_ig_ITEXT(t):
    r'[\s\S]'
    pass

import ply.lex as lex
lex.lex(debug=True, debuglog=log)
Here is the error:
ERROR: /Users/CatLover/Documents/Python_Beta/TexExtractor/texlexparse.py:154: No regular expression defined for rule 't_ET'
I don't know why the regular expressions defined for 't_ET' etc. via @TOKEN do not work.
Ply is a parser generator. It takes your parser/lexer description and compiles a parser/lexer from it. You cannot change the description of the language during the parse.
In this particular case, you might be better off writing a streaming ("online") scanner. But if you want to use Ply, then you will be better off not trying to modify the grammar to ignore parts of the input. Just parse the entire input and ignore the parts you're not interested in. You'll probably find that the code is much simpler.
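To see where the "No regular expression defined" error comes from: @TOKEN captures its argument once, at function-definition time, and lex.lex() compiles the master regular expression immediately afterwards. A minimal sketch of the failure mode (hypothetical token name):
import ply.lex as lex
from ply.lex import TOKEN

tokens = ('WORD',)
pattern = ''            # still empty when the decorator runs

@TOKEN(pattern)         # attaches '' as t_WORD's regex, right now
def t_WORD(t):
    return t

def t_error(t):
    t.lexer.skip(1)

lex.lex()               # ERROR: No regular expression defined for rule 't_WORD'
Rebinding pattern afterwards has no effect, which is why building idb_list/ide_list during lexing cannot feed new regexes back into an already-built lexer.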

Windows/Python Error WindowsError: [Error 3] The system cannot find the path specified

Hi, I am new to Python and I need some help. I am trying to run a file on Windows 10 with Python 2.7.
import os
import re
import codecs
import numpy as np
import theano
models_path = "./models"
eval_path = "./evaluation"
eval_temp = os.path.join(eval_path, "temp")
eval_script = os.path.join(eval_path, "conlleval")
def get_name(parameters):
    """
    Generate a model name from its parameters.
    """
    l = []
    for k, v in parameters.items():
        if type(v) is str and "/" in v:
            l.append((k, v[::-1][:v[::-1].index('/')][::-1]))
        else:
            l.append((k, v))
    name = ",".join(["%s=%s" % (k, str(v).replace(',', '')) for k, v in l])
    return "".join(i for i in name if i not in "\/:*?<>|")


def set_values(name, param, pretrained):
    """
    Initialize a network parameter with pretrained values.
    We check that sizes are compatible.
    """
    param_value = param.get_value()
    if pretrained.size != param_value.size:
        raise Exception(
            "Size mismatch for parameter %s. Expected %i, found %i."
            % (name, param_value.size, pretrained.size)
        )
    param.set_value(np.reshape(
        pretrained, param_value.shape
    ).astype(np.float32))


def shared(shape, name):
    """
    Create a shared object of a numpy array.
    """
    if len(shape) == 1:
        value = np.zeros(shape)  # bias are initialized with zeros
    else:
        drange = np.sqrt(6. / (np.sum(shape)))
        value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape)
    return theano.shared(value=value.astype(theano.config.floatX), name=name)


def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico


def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item


def zero_digits(s):
    """
    Replace every digit in a string by a zero.
    """
    return re.sub(r'\d', '0', s)


def iob2(tags):
    """
    Check that tags have a valid IOB format.
    Tags in IOB1 format are converted to IOB2.
    """
    for i, tag in enumerate(tags):
        if tag == 'O':
            continue
        split = tag.split('-')
        if len(split) != 2 or split[0] not in ['I', 'B']:
            return False
        if split[0] == 'B':
            continue
        elif i == 0 or tags[i - 1] == 'O':  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
        elif tags[i - 1][1:] == tag[1:]:
            continue
        else:  # conversion IOB1 to IOB2
            tags[i] = 'B' + tag[1:]
    return True
def iob_iobes(tags):
    """
    IOB -> IOBES
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'B':
            if i + 1 != len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('B-', 'S-'))
        elif tag.split('-')[0] == 'I':
            if i + 1 < len(tags) and \
                    tags[i + 1].split('-')[0] == 'I':
                new_tags.append(tag)
            else:
                new_tags.append(tag.replace('I-', 'E-'))
        else:
            raise Exception('Invalid IOB format!')
    return new_tags


def iobes_iob(tags):
    """
    IOBES -> IOB
    """
    new_tags = []
    for i, tag in enumerate(tags):
        if tag.split('-')[0] == 'B':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'I':
            new_tags.append(tag)
        elif tag.split('-')[0] == 'S':
            new_tags.append(tag.replace('S-', 'B-'))
        elif tag.split('-')[0] == 'E':
            new_tags.append(tag.replace('E-', 'I-'))
        elif tag.split('-')[0] == 'O':
            new_tags.append(tag)
        else:
            raise Exception('Invalid format!')
    return new_tags


def insert_singletons(words, singletons, p=0.5):
    """
    Replace singletons by the unknown word with a probability p.
    """
    new_words = []
    for word in words:
        if word in singletons and np.random.uniform() < p:
            new_words.append(0)
        else:
            new_words.append(word)
    return new_words


def pad_word_chars(words):
    """
    Pad the characters of the words in a sentence.
    Input:
        - list of lists of ints (list of words, a word being a list of char indexes)
    Output:
        - padded list of lists of ints
        - padded list of lists of ints (where chars are reversed)
        - list of ints corresponding to the index of the last character of each word
    """
    max_length = max([len(word) for word in words])
    char_for = []
    char_rev = []
    char_pos = []
    for word in words:
        padding = [0] * (max_length - len(word))
        char_for.append(word + padding)
        char_rev.append(word[::-1] + padding)
        char_pos.append(len(word) - 1)
    return char_for, char_rev, char_pos


def create_input(data, parameters, add_label, singletons=None):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    words = data['words']
    chars = data['chars']
    if singletons is not None:
        words = insert_singletons(words, singletons)
    if parameters['cap_dim']:
        caps = data['caps']
    char_for, char_rev, char_pos = pad_word_chars(chars)
    input = []
    if parameters['word_dim']:
        input.append(words)
    if parameters['char_dim']:
        input.append(char_for)
        if parameters['char_bidirect']:
            input.append(char_rev)
        input.append(char_pos)
    if parameters['cap_dim']:
        input.append(caps)
    if add_label:
        input.append(data['tags'])
    return input
def evaluate(parameters, f_eval, raw_sentences, parsed_sentences,
             id_to_tag, dictionary_tags, eval_id):
    """
    Evaluate current model using CoNLL script.
    """
    n_tags = len(id_to_tag)
    predictions = []
    count = np.zeros((n_tags, n_tags), dtype=np.int32)
    for raw_sentence, data in zip(raw_sentences, parsed_sentences):
        input = create_input(data, parameters, False)
        if parameters['crf']:
            y_preds = np.array(f_eval(*input))[1:-1]
        else:
            y_preds = f_eval(*input).argmax(axis=1)
        y_reals = np.array(data['tags']).astype(np.int32)
        assert len(y_preds) == len(y_reals)
        p_tags = [id_to_tag[y_pred] for y_pred in y_preds]
        r_tags = [id_to_tag[y_real] for y_real in y_reals]
        if parameters['tag_scheme'] == 'iobes':
            p_tags = iobes_iob(p_tags)
            r_tags = iobes_iob(r_tags)
        for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)):
            new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]])
            predictions.append(new_line)
            count[y_real, y_pred] += 1
        predictions.append("")
    # Write predictions to disk and run CoNLL script externally
    # eval_id = np.random.randint(1000000, 2000000)
    output_path = os.path.join(eval_temp, "eval.%i.output" % eval_id)
    scores_path = os.path.join(eval_temp, "eval.%i.scores" % eval_id)
    with codecs.open(output_path, 'w', 'utf8') as f:
        f.write("\n".join(predictions))
    os.system("%s < %s > %s" % (eval_script, output_path, scores_path))
    # CoNLL evaluation results
    eval_lines = [l.rstrip() for l in codecs.open(scores_path, 'r', 'utf8')]
    # trainLog = open('train.log', 'w')
    for line in eval_lines:
        print line
        # trainLog.write("%s\n" % line)
    # Remove temp files
    # os.remove(output_path)
    # os.remove(scores_path)
    # Confusion matrix with accuracy for each tag
    print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
        "ID", "NE", "Total",
        *([id_to_tag[i] for i in xrange(n_tags)] + ["Percent"])
    )
    for i in xrange(n_tags):
        print ("{: >2}{: >7}{: >7}%s{: >9}" % ("{: >7}" * n_tags)).format(
            str(i), id_to_tag[i], str(count[i].sum()),
            *([count[i][j] for j in xrange(n_tags)] +
              ["%.3f" % (count[i][i] * 100. / max(1, count[i].sum()))])
        )
    # Global accuracy
    print "%i/%i (%.5f%%)" % (
        count.trace(), count.sum(), 100. * count.trace() / max(1, count.sum())
    )
    # F1 on all entities
    return float(eval_lines[1].strip().split()[-1])
When I run the code as-is, I always get the error below. I think it is either because of the restriction on path length in Windows, or because the path needs different slashes. I don't know what to add or remove to resolve the problem.
run train.py --train lstm/fold1/train --dev lstm/fold1/dev --test lstm/fold1/test
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
Using gpu device 0: GeForce GT 620M (CNMeM is enabled with initial size: 85.0% of memory, cuDNN not available)
Traceback (most recent call last):
  File "E:\New-Code\tagger-master\tagger-master\train.py", line 135, in <module>
    model = Model(parameters=parameters, models_path=models_path)
  File "model.py", line 36, in __init__
    os.makedirs(self.model_path)
  File "C:\Users\Acer\Anaconda2\envs\env_name27\lib\os.py", line 157, in makedirs
    mkdir(name, mode)
WindowsError: [Error 3] The system cannot find the path specified: './models\tag_scheme=iob,lower=False,zeros=False,char_dim=25,char_lstm_dim=25,char_bidirect=True,word_dim=100,word_lstm_dim=100,word_bidirect=True,pre_emb=,all_emb=False,cap_dim=0,crf=True,dropout=0.3,lr_method=sgd-lr_.005'
On Windows, paths use the backslash \ instead of the forward slash / used on Linux/Unix.
Try it like below if the file is one folder back:
models_path = "..\models"
eval_path = "..\evaluation"

Working with nested dictionaries and formatting for display

I have a partial answer from here: Construct a tree from list of file paths (Python) - Performance dependent
My specific problem requires me to go from this:
dir/file 10
dir/dir2/file2 20
dir/dir2/file3 10
dir/file3 10
dir3/file4 10
dir3/file5 10
to this:
dir/ **50**
    dir2/ **30**
        file2
        file3
    file
    file3
dir3/ **20**
    file4
    file5
Basically, the numbers at the end are the file sizes, and I have been trying to figure out how to roll the total size of all the files up to their parent directory.
Edit:
r = re.compile(r'(.+\t)(\d+)')

def prettify(d, indent=0):
    for key, value in d.iteritems():
        ss = 0
        if key == FILE_MARKER:
            if value:
                for each in value:
                    mm = r.match(each)
                    ss += int(mm.group(2))
                    print ' ' * indent + each
                print ' ' * indent + format_size(ss)  # <-- the line I added
        else:
            print ' ' * indent + str(key)
            if isinstance(value, dict):
                addSizes(value, indent+1)
            else:
                print ' ' * (indent+1) + str(value)
This is mac's answer from the link above, which I edited to use a regex.
The solutions that occurred to me all led to creating a new dict or adding an inner function.
I have lost my whole day on this and wish I had asked for help earlier in the day.
Please help.
Not the most elegant thing in the world, but this should get you where you need to be. You'll need to change the tree creation function to deal with whatever form of input you are getting. Once the tree is generated it's just using a recursive tree traversal to form the output.
import re

input_dirs = """dir/file 10
dir/dir2/file2 20
dir/dir2/file3 10
dir/file 10
dir3/file4 10
dir3/file5 10
dir/dir2/dir4/file2 10"""

def create_file_tree(input_string):
    dir_dict = {}
    for file_path in input_string.split('\n'):
        path_list = re.sub('/', ' ', file_path).split()
        path_list[-1] = int(path_list[-1])
        path_dict = dir_dict
        final_item = ""
        for item in path_list[:-1]:
            parent_dict = path_dict
            last_item = item
            path_dict = path_dict.setdefault(item, {})
        parent_dict[last_item] = path_list[-1]
    return dir_dict

def pretty_file_tree(file_tree):
    def traverse(sub_dict, indent=0, total=0):
        string_out = ""
        indent += 1
        for key in sorted(sub_dict.keys()):
            if type(sub_dict[key]) == dict:
                sub_total = traverse(sub_dict[key], indent, 0)
                total += sub_total[0]
                string_out += ' '*indent + key + ' ' + '**' + str(sub_total[0]) + '**' + '\n' + sub_total[1]
            else:
                string_out += ' '*indent + key + '\n'
                total += sub_dict[key]
        return total, string_out
    output_string = traverse(file_tree)
    print(output_string[1])

pretty_file_tree(create_file_tree(input_dirs))
Sorry it's not following the code you posted, but I'd begun to produce this before the edit...

As you process the input, build a string with placeholders (%d) for the numbers, then print out the string.
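A minimal sketch of that placeholder idea (made-up data):
sizes = [10, 20, 10]
line = 'dir/ **%d**'       # %d stands in for the not-yet-known total
total = sum(sizes)         # fill it in once the subtree has been summed
print(line % total)        # -> dir/ **40**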

Python Write To File Missing Lines

I'm having trouble using Python to write strings into a file.
(What I'm trying to do is use Python to generate some C programs.)
The code I have is the following:
filename = "test.txt"
i = 0
string = "image"
tempstr = ""
average1 = "average"
average2 = "average*average"
output = ""
FILE = open(filename,"w")
while i < 20:
j = 0
output = "square_sum = square_sum + "
while j < 20:
tempstr = string + "_" + str(i) + "_" + str(j)
output = output + tempstr + "*" + tempstr + " + " + average2 + " - 2*" + average1 + "*" + tempstr
if j != 19:
output = output + " + "
if j == 19:
output = output + ";"
j = j + 1
output = output + "\n"
i = i + 1
print(output)
FILE.writelines(output)
FILE.close
The print gives me the correct output, but the file has the last line missing and part of the second-to-last line missing. What's the problem with writing strings into the file?
Thank you!
It would probably help if you actually called the method:
FILE.close()
The problem is that you aren't calling the close() method, just mentioning it in the last line. You need parentheses to invoke a function.
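A two-line illustration of the difference:
FILE.close      # just looks up the method object and discards it; the file stays open
FILE.close()    # actually calls close(), flushing buffered data to disk
That unflushed buffer is exactly why the tail of your output goes missing.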
Python's with statement can make that unnecessary though:
with open(filename, "w") as the_file:
    while i < 20:
        j = 0
        output = "square_sum = square_sum + "
        ...
        print(output)
        the_file.writelines(output)
When the with clause is exited, the_file will be closed automatically.
Try:
with open(filename, "w") as FILE:
    while i < 20:
        # rest of your code with proper indent...
No close needed...
First, a Pythonified version of your code:
img = 'image_{i}_{j}'
avg = 'average'
clause = '{img}*{img} + {avg}*{avg} - 2*{avg}*{img}'.format(img=img, avg=avg)
clauses = (clause.format(i=i, j=j) for i in xrange(20) for j in xrange(20))
joinstr = '\n + '
output = 'square_sum = {};'.format(joinstr.join(clauses))
fname = 'output.c'
with open(fname, 'w') as outf:
    print output
    outf.write(output)
Second, it looks like you are hoping to speed up your C code by fanatical inlining. I very much doubt the speed gains will justify your efforts over something like
maxi = 20;
maxj = 20;
sum = 0;
sqsum = 0;
for(i=0; i<maxi; i++)
    for(j=0; j<maxj; j++) {
        t = image[i][j];
        sum += t;
        sqsum += t*t;
    }
square_sum = sqsum + maxi*maxj*average*average - 2*sum*average;
Looks like your indentation may be incorrect, but here are some other comments about your code:
writelines() writes the content of a list or iterator to the file.
Since you're outputting a single string, just use write().
lines = ["lineone\n", "line two\n"]
f = open("myfile.txt", "w")
f.writelines(lines)
f.close()
Or just:
output = "big long string\nOf something important\n"
f = open("myfile.txt", "w")
f.write(output)
f.close()
As another side note, it may be helpful to use the += operator.
output += "more text"
# is equivalent to
output = output + "more text"
