I have two files. One creates a NumPy array in compressed sparse row format:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import PorterStemmer
import numpy as np
import string
import csv
import json
import pdb

def stem_document(document):
    translatedict = ""
    stemmer = PorterStemmer()
    for word in string.punctuation:
        translatedict = translatedict + word
    doc_stemmed = []
    for word in document.split():
        lowerstrippedword = ''.join(c for c in word.lower() if c not in translatedict)
        try:
            stemmed_word = stemmer.stem(lowerstrippedword)
            doc_stemmed.append(stemmed_word)
        except:
            print lowerstrippedword + " could not be stemmed."
    return ' '.join(doc_stemmed)

def readFileandStem(filestring):
    with open(filestring, 'r') as file:
        reader = csv.reader(file)
        file_extras = []
        vector_data = []
        error = False
        while (error == False):
            try:
                next = reader.next()
                if len(next) == 3 and next[2] != "":
                    document = next[2]
                    stemmed_document = stem_document(document)
                    vector_data.append(stemmed_document)
                    file_extra = []
                    file_extra.append(next[0])
                    file_extra.append(next[1])
                    file_extras.append(file_extra)
            except:
                error = True
    return [vector_data, file_extras]

filestring = 'Data.csv'

print "Reading File"
data = readFileandStem(filestring)
documents = data[0]
file_extras = data[1]

print "Vectorizing Data"
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(documents)
tf_idf_transform = TfidfTransformer(use_idf=False).fit(matrix)
tf_idf_matrix = tf_idf_transform.transform(matrix)

with open('matrix/matrix.npy', 'w') as matrix_file:
    np.save(matrix_file, tf_idf_matrix)

file_json_map = {}
file_json_map['extras'] = file_extras
with open('matrix/extras.json', 'w') as extras_file:
    extras_file.write(json.dumps(file_json_map))

print "finished"
The next file is supposed to load the matrix that was saved...
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
import json
import pdb
with open('matrix/matrix.npy', 'r') as matrix_file:
    matrix = np.load(matrix_file)

hcluster = linkage(matrix, "complete")
However, I get the following error:
File "Cluster.py", line 7, in <module>
matrix = np.load(matrix_file)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\npyio.py", line 406, in load
pickle_kwargs=pickle_kwargs)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 620, in read_array
version = read_magic(fp)
File "C:\Users\jarek\Anaconda2\lib\site-packages\numpy\lib\format.py", line 216, in read_magic
raise ValueError(msg % (MAGIC_PREFIX, magic_str[:-2]))
ValueError: the magic string is not correct; expected '\x93NUMPY', got '\x00\x00I\x1c\x00\x00'
I don't know why the magic string would be incorrect because from what I've looked into, all .npy files are supposed to have the same magic string "\x93NUMPY".
Ideas?
I encountered a similar issue before.
Changing
open('matrix/matrix.npy', 'w')
...
open('matrix/matrix.npy', 'r')
to
open('matrix/matrix.npy', 'wb')
...
open('matrix/matrix.npy', 'rb')
solved my problem.
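For reference, here is a minimal round trip showing the fix with a small stand-in array (a sketch, not the original matrix). np.save writes binary data, so on Windows the file has to be opened in binary mode; text mode translates newline bytes and corrupts the header that carries the '\x93NUMPY' magic string.

import numpy as np

arr = np.arange(6).reshape(2, 3)                        # stand-in for the real matrix

with open('matrix/matrix.npy', 'wb') as matrix_file:    # 'wb', not 'w'
    np.save(matrix_file, arr)

with open('matrix/matrix.npy', 'rb') as matrix_file:    # 'rb', not 'r'
    loaded = np.load(matrix_file)

print loaded

Passing the filename directly, e.g. np.save('matrix/matrix.npy', arr), also avoids the problem, because np.save then opens the file in binary mode itself.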
I am using a script in Fusion 360 called importsplinecsv.
I was wondering if it is possible to modify the script so that it only imports every 10th row, as the number of rows being imported is very large and is bloating the design.
If I could get some help, that would be awesome.
Here is the script:
#Author-Autodesk Inc.
#Description-Import spline from csv file

import adsk.core, adsk.fusion, traceback
import io

def run(context):
    ui = None
    try:
        app = adsk.core.Application.get()
        ui = app.userInterface
        # Get all components in the active design.
        product = app.activeProduct
        design = adsk.fusion.Design.cast(product)
        title = 'Import Spline csv'
        if not design:
            ui.messageBox('No active Fusion design', title)
            return

        dlg = ui.createFileDialog()
        dlg.title = 'Open CSV File'
        dlg.filter = 'Comma Separated Values (*.csv);;All Files (*.*)'
        if dlg.showOpen() != adsk.core.DialogResults.DialogOK:
            return

        filename = dlg.filename
        with io.open(filename, 'r', encoding='utf-8-sig') as f:
            points = adsk.core.ObjectCollection.create()
            line = f.readline()
            data = []
            while line:
                pntStrArr = line.split(',')
                for pntStr in pntStrArr:
                    try:
                        data.append(float(pntStr))
                    except:
                        break

                if len(data) >= 3:
                    point = adsk.core.Point3D.create(data[0], data[1], data[2])
                    points.add(point)
                line = f.readline()
                data.clear()

        if points.count:
            root = design.rootComponent
            sketch = root.sketches.add(root.xYConstructionPlane)
            sketch.sketchCurves.sketchFittedSplines.add(points)
        else:
            ui.messageBox('No valid points', title)

    except:
        if ui:
            ui.messageBox('Failed:\n{}'.format(traceback.format_exc()))
I have not used this library before, but try:

for i, line in enumerate(f):
    if i % 10 == 0:
        # then your import commands here

Here f is your file object, i is the line number (starting at 0), and line is that line's contents.
        dlg = ui.createFileDialog()
        dlg.title = 'Open CSV File'
        dlg.filter = 'Comma Separated Values (*.csv);;All Files (*.*)'
        if dlg.showOpen() != adsk.core.DialogResults.DialogOK:
            return

        filename = dlg.filename
        with io.open(filename, 'r', encoding='utf-8-sig') as f:
            points = adsk.core.ObjectCollection.create()
            data = []
            for i, line in enumerate(f):
                if i % 10 == 0:          # only keep every 10th row
                    pntStrArr = line.split(',')
                    for pntStr in pntStrArr:
                        try:
                            data.append(float(pntStr))
                        except:
                            break

                    if len(data) >= 3:
                        point = adsk.core.Point3D.create(data[0], data[1], data[2])
                        points.add(point)
                    data.clear()

        if points.count:
            root = design.rootComponent
            sketch = root.sketches.add(root.xYConstructionPlane)
            sketch.sketchCurves.sketchFittedSplines.add(points)
        else:
            ui.messageBox('No valid points', title)

    except:
        if ui:
            ui.messageBox('Failed:\n{}'.format(traceback.format_exc()))
I am trying to clean up some text files in Python. I want to take out stop words, digits and the newline character, but I keep getting a "coercing to Unicode" error. Here is my code:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from string import digits
import glob

def cleanupDoc(s):
    s = s.translate(None, digits)
    s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup

flist = glob.glob('/home/uiucinfo/Desktop/*txt')
mylist = []
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    doc = open(fdoc)
    newDoc = cleanupDoc(doc)
    doc.close()
My Error
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
TypeError: coercing to Unicode: need string or buffer, list found
tfile.readlines() gives you a list of lines, which you are appending to another list:
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist.append(line)

As a result, you have a list of lists in mylist.
The following should fix the problem:

for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    mylist += line
This will give you a list of strings in mylist.
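A quick illustration of the difference (a minimal sketch, not part of the original code):

lines = ["a\n", "b\n"]

nested = []
nested.append(lines)   # nested == [['a\n', 'b\n']] -- a list containing one list

flat = []
flat += lines          # flat == ['a\n', 'b\n']     -- a flat list of strings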
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
#nltk.download()
import string
from string import digits
import glob
import re

def cleanupDoc(s):
    #s = s.translate(None,digits)
    #s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(s)
    cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
    return cleanup

flist = glob.glob('/home/uiucinfo/Desktop/*txt')
mylist = []
for fname in flist:
    tfile = open(fname, 'r+')
    line = tfile.readlines()
    #line = cleanupDoc(line)
    mylist.append(line)

for fdoc in mylist:
    # remove \n or digits from fdoc
    fdoc = [re.sub(r'[\"\n]|\d', '', x) for x in fdoc]
    # convert the list of lines into a single string
    fdoc = ''.join(fdoc)
    print fdoc
    newDoc = cleanupDoc(fdoc)
    print " newDoc: ", newDoc
I have to create a function that reads a random line from a text file in Python.
I have the following code but am not able to get it to work:
import random

def randomLine(filename):
    # Retrieve a random line from a file, reading through the file irrespective of its length
    fh = open(filename.txt, "r")
    lineNum = 0
    it = ''
    while 1:
        aLine = fh.readline()
        lineNum = lineNum + 1
        if aLine != "":
            # How likely is it that this is the last line of the file?
            if random.uniform(0, lineNum) < 1:
                it = aLine
        else:
            break
    fh.close()
    return it

print(randomLine(testfile.txt))
I got this far, but I need help to go further. Please help.
Once the program runs, I get an error saying:
print(randomLine(testfile.txt))
NameError: name 'testfile' is not defined
Here's a version that's been tested to work, and avoids empty lines.
Variable names are verbose for clarity.
import random
import sys

def random_line(file_handle):
    lines = file_handle.readlines()
    num_lines = len(lines)
    random_line = None
    while not random_line:
        random_line_num = random.randint(0, num_lines - 1)
        random_line = lines[random_line_num]
        random_line = random_line.strip()
    return random_line

file_handle = None
if len(sys.argv) < 2:
    sys.stderr.write("Reading stdin\n")
    file_handle = sys.stdin
else:
    file_handle = open(sys.argv[1])

print(random_line(file_handle))
file_handle.close()
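Separately, the NameError in the question comes from passing testfile.txt as a bare name, so Python looks for a variable called testfile. The filename has to be a string, and the function should open its parameter rather than filename.txt. A minimal sketch of just that fix:

import random

def randomLine(filename):
    with open(filename, "r") as fh:        # open the parameter, not filename.txt
        lines = fh.readlines()
    return random.choice(lines) if lines else ''

print(randomLine("testfile.txt"))          # the filename is passed as a string literal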
################################################################################
#...................Program to create feature vector (N-grams) ...............
################################################################################
import ast
import csv
import os
import sys
from string import *
from BST import Node

maxInt = sys.maxsize
decrement = True

while decrement:
    # decrease the maxInt value by factor 10
    # as long as the OverflowError occurs.
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True

def File_Write(filename, write_ist):
    filewrite = open(filename, "w")
    filewrite.writerows(str(write_ist))
    filewrite.close()

#################################################################################
#................Function to read the files and create the FVT...................
# Change 2,3,4,5,6,7
#################################################################################
def read_file_list(bigram, class_label):
    frqlist = []
    root.reset_frequency()
    for i in range(0, len(bigram)):
        row = str(bigram[i])
        row = row.strip()
        row = row.replace('(', '')
        row = row.replace(')', '')
        row = row.replace("'", '')
        row = row.replace(",", ' ')
        node, parent = root.lookup(row)
        if node:
            root.increment_frequency(node)
    frqlist = root.print_tree()
    ## Attach the class_label............
    root.finalize_frq_lst(class_label)
    root.write_to_csv(file_write1)

##################################################################################
#.................................MAIN PROGRAM....................................
##################################################################################
feature_list = ""
root_flag = 'false'

file_path_data = "/home/xxx/Project/Dataset/Cross/N_grams/7_POStags.csv"  ## Input file containing Bigrams of blogdata
fp_data = csv.reader(open(file_path_data, "r"), delimiter=',')

file_path_feature = "/home/xxx/Project/Dataset/Cross/N_gram_Features/7_gram.txt"  ## Input file containing sorted Bigrams
fp_feature = open(file_path_feature, "r")
list1 = fp_feature.read()

#### Convert String into list ...................
read_list = ast.literal_eval(list1)
read_list1 = list(set(read_list))
print read_list1

for i in range(0, len(read_list)):
    feature = str(read_list[i])
    feature = feature.strip()
    feature = feature.replace('(', '')
    feature = feature.replace(')', '')
    feature = feature.replace("'", '')
    feature = feature.replace(",", ' ')
    if root_flag == 'false':
        root = Node(feature)
        root_flag = 'true'
    else:
        root.insert(feature)
    feature_list = feature_list + "\n" + feature

feature_list1 = feature_list.strip()
line = feature_list1.split('\n')
##print "#######################################################################"
##print line
line1 = list(set(line))
print len(line1)
##print "#######################################################################"
line1.sort()
i = 1

###### Setting the path for input and output files .......................
output_file = "/home/xxx/Project/Dataset/Cross/N_grams_recored/7_gram.csv"  ## Output file..............
with open(output_file, 'w') as fo:
    file_write1 = csv.writer(fo, delimiter=',', quotechar='"')
    file_write1.writerow(line1)
    #### Write header data into output file
    for data in fp_data:
        feature = ast.literal_eval(data[0])
        class_label = data[1]
        read_file_list(feature, class_label)
        print feature
        print i
        i = i + 1
This is my code to record counts of 7-grams over 3277 sample documents. I am trying to create a BST of roughly 76,000 7-grams, but I am getting an error like this:
Traceback (most recent call last):
File "N_gram_Record (2-7).py", line 79, in <module>
read_list=ast.literal_eval(list1)
File "/usr/lib/python2.7/ast.py", line 49, in literal_eval
node_or_string = parse(node_or_string, mode='eval')
File "/usr/lib/python2.7/ast.py", line 37, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
MemoryError
I think the MemoryError happens when I try to create the BST of 7-grams, since there are about 76,000 of them. Any ideas on how to overcome this problem?
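For what it is worth, the traceback shows the MemoryError is raised inside ast.literal_eval, i.e. while parsing the feature file as one giant list literal, before the BST is even built. One hedged workaround (an assumption, not something confirmed for this dataset) is to regenerate the feature file with one n-gram per line and stream it, so the whole literal never has to sit in memory at once:

# Hypothetical sketch: assumes the feature file can be rewritten as one n-gram per line.
def iter_features(path):
    with open(path, "r") as fp:
        for raw in fp:
            feature = raw.strip()
            if feature:
                yield feature

# Usage: build the BST incrementally instead of materialising the full list first.
# for feature in iter_features(file_path_feature):
#     root.insert(feature)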
Long time listener, first time caller! So, I have this Python script that is for parsing a Google Base Feed text file. It's taking out particular pieces of data and creating a formatted file I can upload on to Bing Shopping. After finally getting it to run, I've discovered that it just outputs blank files instead of the cleaned up data I wanted. What am I missing here? I really appreciate any help! Fair warning, I'm a pretty big Python newb, and I've had a lot of help writing this already.
import sys, os
import pandas as pd
import datetime

def remove_quotes(data):
    lines = data.split('\n')
    for i, line in enumerate(lines):
        lines[i] = lines[i].replace('"', '')
        print lines[i]
    return data

def tab_error(index, line, output):
    count = len(line.split('\t'))
    if count != 19:
        err = 'Tab issue at line {linenum} : {numtabs} extra tabs'.\
            format(linenum=index, numtabs=(count-19))
        print err
        output.write(err+'\n')
        return True
    return False

def html_error(index, line, output):
    htmltags = ['&fract12', '&39', '&', '&qt;', '<', '&rt;', '"', '>', 'quot', '’']
    for tag in htmltags:
        if line.find(tag) > 0:
            err = 'HTML issue at line {linenum}'.\
                format(linenum=index)
            print err
            output.write(err+'\n')
            return True
    return False

def read_data(filename):
    with open(filename, 'r') as infile:
        data = infile.read()
    return data

def tabs_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        header = ''
        for x in xrange(19):
            header += 'x'+str(x+1)+'\t'
        cleanfile.write(header)
        # for each line in the file
        for i, line in enumerate(data.split('\r')[1:]):
            # check line for tabs error
            data_error = tab_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error:
                cleanfile.write('\n'+newline)

def html_check(data, output, filename):
    with open(filename, 'w') as cleanfile:
        # for each line in the file
        lines = data.split('\n')
        cleanfile.write(lines[0])
        for i, line in enumerate(lines[1:]):
            # check line for HTML errors
            data_error = html_error(i, line, output)
            newline = line.replace('"', '')
            newline = newline.strip()
            if not data_error and newline:
                cleanfile.write('\n'+newline)
if __name__ == '__main__':
    # Clean tabs
    filename = sys.argv[1]
    ts = datetime.datetime.now().isoformat()
    print ts
    with open('bing_errors.txt', 'w') as output:
        # print 'Removing quotes within .. product description and ...'
        # data = remove_quotes(data)
        print 'Removing lines with more than 19 tabs...'
        data = read_data(filename)
        tabs_check(data, output, 'clean19.txt')

        # Delete and reorder columns
        print 'Deleting and reordering columns...'
        df = pd.read_table('clean19.txt')
        tmp = df[['x8', 'x2', 'x3', 'x4', 'x6', 'x1', 'x5']]
        tmp.columns = ['MPID',
                       'Brand (BrandorManufacturer)',
                       'Title',
                       'Item Description',
                       'Price',
                       'ProductURL',
                       'ImageURL']
        tmp.to_csv('tmp.txt', index=False, sep='\t')
        os.remove('clean19.txt')

        # HTML errors
        print 'Checking for HTML errors...'
        data = read_data('tmp.txt')
        html_check(data, output, 'BT1.txt')
        os.remove('tmp.txt')
        # row = tmp[tmp['MPID'] == 8724]
        # print row
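One thing worth checking (a guess, not a confirmed diagnosis): tabs_check splits the raw text on '\r'. If the exported feed actually uses '\n' or '\r\n' line endings, data.split('\r') returns a single chunk, the [1:] slice then drops everything, and clean19.txt ends up with nothing but the header, which would explain the blank output. A tiny self-contained demonstration of that behaviour with made-up data:

# Hypothetical feed content using '\n' line endings
data = "header\trow\nitem1\tvalue1\nitem2\tvalue2\n"

print data.split('\r')[1:]     # [] -- nothing left to iterate over
print data.splitlines()[1:]    # ['item1\tvalue1', 'item2\tvalue2']

If that turns out to be the case, iterating over data.splitlines()[1:] in tabs_check would be a minimal change to try.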