Related
This is what I have gotten while trying to run step 3 of this source code:
https://github.com/carykh/lazykh
Error:
Traceback (most recent call last):
File "C:\Users\User\Desktop\lazykh-main\code\scheduler.py", line 93, in
OS_nextIndex = originalScript.index(wordString,OS_IndexAt)+len(wordString)
ValueError: substring not found
Code:
import argparse
import os.path
import json
import numpy as np
import random
def addPhoneme(p, t):
global prevPhoneme
global f
if p != prevPhoneme:
strings[4] += (str.format('{0:.3f}', t)+",phoneme,"+p+"\n")
prevPhoneme = p
def pickNewPose(t):
global pose
global prevPose
global POSE_COUNT
global prevPhoneme
global f
newPose = -1
while newPose == -1 or newPose == pose or newPose == prevPose:
newPose = int(random.random()*POSE_COUNT)
prevPose = pose
pose = newPose
strings[3] += (str.format('{0:.3f}', t)+",pose,"+str(pose)+"\n")
prevPhoneme = "na"
strings = [""]*5
POSE_COUNT = 5
emotions = {}
emotions["explain"] = 0
emotions["happy"] = 1
emotions["sad"] = 2
emotions["angry"] = 3
emotions["confused"] = 4
emotions["rq"] = 5
mouthList = [["aa","a"],["ae","a"],["ah","a"],["ao","a"],["aw","au"],
["ay","ay"],["b","m"],["ch","t"],["d","t"],["dh","t"],
["eh","a"],["er","u"],["ey","ay"],["f","f"],["g","t"],
["hh","y"],["ih","a"],["iy","ay"],["jh","t"],["k","t"],
["l","y"],["m","m"],["n","t"],["ng","t"],["ow","au"],
["oy","ua"],["p","m"],["r","u"],["s","t"],["sh","t"],
["t","t"],["th","t"],["uh","u"],["uw","u"],["v","f"],
["w","u"],["y","y"],["z","t"],["zh","t"],
["oov","m"]] # For unknown phonemes, the stick figure will just have a closed mouth ("mmm")
mouths = {}
for x in mouthList:
mouths[x[0]] = x[1]
ENDING_PHONEME = "m"
STOPPERS = [",",";",".",":","!","?"]
parser = argparse.ArgumentParser(description='blah')
parser.add_argument('--input_file', type=str, help='the script')
args = parser.parse_args()
INPUT_FILE = args.input_file
f = open(INPUT_FILE+".txt","r+")
originalScript = f.read()
f.close()
f = open(INPUT_FILE+".json","r+")
fileData = f.read()
f.close()
data = json.loads(fileData)
WORD_COUNT = len(data['words'])
pose = -1
prevPose = -1
prevPhoneme = "na"
emotion = "0"
pararaph = 0
image = 0
OS_IndexAt = 0
pickNewPose(0)
strings[1] += "0,emotion,0\n"
strings[0] += "0,paragraph,0\n"
strings[2] += "0,image,0\n"
strings[4] += "0,phoneme,m\n"
for i in range(WORD_COUNT):
word = data['words'][i]
if "start" not in word:
continue
wordString = word["word"]
timeStart = word["start"]
OS_nextIndex = originalScript.index(wordString,OS_IndexAt)+len(wordString)
if "<" in originalScript[OS_IndexAt:]:
tagStart = originalScript.index("<",OS_IndexAt)
tagEnd = originalScript.index(">",OS_IndexAt)
if OS_nextIndex > tagStart and tagEnd >= OS_nextIndex:
OS_nextIndex = originalScript.index(wordString,tagEnd)+len(wordString)
nextDigest = originalScript[OS_IndexAt:OS_nextIndex]
if "\n" in nextDigest and data['words'][i-1]['case'] != 'not-found-in-audio' and (prevPhoneme == "a" or prevPhoneme == "f" or prevPhoneme == "u" or prevPhoneme == "y"):
addPhoneme("m", data['words'][i-1]["end"])
"""print(wordString)
print(str(OS_IndexAt)+", "+str(OS_nextIndex))
print(nextDigest)
print("")"""
pickedPose = False
for stopper in STOPPERS:
if stopper in nextDigest:
pickNewPose(timeStart)
pickedPose = True
if "<" in nextDigest:
leftIndex = nextDigest.index("<")+1
rightIndex = nextDigest.index(">")
emotion = emotions[nextDigest[leftIndex:rightIndex]]
strings[1] += (str.format('{0:.3f}', timeStart)+",emotion,"+str(emotion)+"\n")
prevPhoneme = "na"
if "\n\n" in nextDigest:
pararaph += 1
image += 1 # The line of the script advances 2 lines whenever we hit a /n/n.
strings[0] += (str.format('{0:.3f}', timeStart)+",paragraph,"+str(pararaph)+"\n")
prevPhoneme = "na"
if "\n" in nextDigest:
image += 1
strings[2] += (str.format('{0:.3f}', timeStart)+",image,"+str(image)+"\n")
prevPhoneme = "na"
if not pickedPose:
pickNewPose(timeStart) # A new image means we also need to have a new pose
phones = word["phones"]
timeAt = timeStart
for phone in phones:
timeAt += phone["duration"]
phoneString = phone["phone"]
if phoneString == "sil":
truePhone = "m"
else:
truePhone = mouths[phoneString[:phoneString.index("_")]]
if len(truePhone) == 2:
addPhoneme(truePhone[0], timeAt-phone["duration"])
addPhoneme(truePhone[1], timeAt-phone["duration"]*0.5)
else:
addPhoneme(truePhone, timeAt-phone["duration"])
OS_IndexAt = OS_nextIndex
f = open(INPUT_FILE+"_schedule.csv","w+")
for i in range(len(strings)):
f.write(strings[i])
if i < len(strings)-1:
f.write("SECTION\n")
f.flush()
f.close()
print(f"Done creating schedule for {INPUT_FILE}.")
The
ValueError: substring not found
occurs when you try to find the index of a substring in a string which does not contain it in the specified (or default) section, using the index function.
The index method takes 3 parameters:
value
start
end
and it searches for the value between start and end.
So, the error occurred because the substring was not found in the section where it was searched for. The line of
OS_nextIndex = originalScript.index(wordString,tagEnd)+len(wordString)
searches for wordString, starting from tagEnd and searches for the likes of
<span>yourwordstring</span>
, but in your case it was not found. You can do one of the following to solve the issue:
you can fix your input if it should always have a match for the search
you can handle the error when the index throws the error
you can use find instead, see https://bobbyhadz.com/blog/python-valueerror-substring-not-found
Note that find also has three parameters, as you can read from https://www.w3schools.com/python/ref_string_find.asp
I am a chip test engineer, and I have one big text file about 8KK lines. For this file, most lines include '='. Meanwhile I have a log file, which is about 300K lines, each line is show a test failure. I need to change the 300K lines of the original file.
Currently it takes about 15 hours to finish the job.
I have existing solution, but it is too slow.
For the code, the parse_log is used to process the log file and get to know each modification to be made, and the stil_parse include below function:
read file as list in memory;
iterate the file, and modify each line in list if included in log file;
write back to disk;
class MaskStil:
def __init__(self):
self.log_signal_file = ''
self.pattern = r"^([^:]+)(:)(\d+)(\s+)(\d+)(\s+)(\d+)(\s+)(\d+)(\s)([.LH]+)$"
self.log_signal = {}
self.log_lines = []
self.mask_dict = {}
self.stil_name_new = ''
self.stil_name = ''
self.signal_all = {}
self.signal_group = []
self.offset = 0
self.mask_mode = -1 # mask_mode 0: revert between L/H; mask_mode 1: mask L/H to Z
self.convert_value=[{"L":"H", "H":"L"}, {"L":"Z", "H":"Z"}]
for i in range(100):
self.log_signal[i] = ''
def digest(self, log_signal, stil_file, signal_group, offset, mask_mode = 1):
self.log_signal_file = log_signal
self.stil_name = stil_file
self.stil_name_new = stil_file[:-5] + '_mask.stil'
self.signal_group = signal_group.replace('=', '+').strip().split('+')
self.offset = offset
self.mask_mode = mask_mode
for i in range(1, len(self.signal_group)):
self.signal_all[self.signal_group[i]] = (i - 1) / 10 + i
print(self.signal_all)
self.parse_log()
self.stil_parse()
def parse_log(self):
with open(self.log_signal_file) as infile:
line_num = 0
blank_line = 0
for line in infile:
line_num += 1
if line_num == 1:
blank_line = line.count(' ')
if "------------------" in line:
break
for i in range(blank_line, len(line)):
self.log_signal[i - blank_line] += line[i]
for (key, value) in self.log_signal.items():
self.log_signal[key] = value.rstrip()
print(self.log_signal)
with open(self.log_signal_file) as log_in:
self.log_lines = log_in.read().splitlines()
for line in self.log_lines:
if re.match(self.pattern, line):
match = re.match(self.pattern, line)
cycle = int(match.group(9))
signals = match.group(11)
# print cycle,signals
self.mask_dict[cycle] = {}
for i in range(len(signals)):
if signals[i] != '.':
self.mask_dict[cycle][i] = signals[i]
def stil_parse(self):
cycle_keys = []
vector_num = 0
for i in self.mask_dict.keys():
cycle_keys.append(i)
with open(self.stil_name, 'r') as stil_in:
stil_in_list = stil_in.read().splitlines()
total_len = len(stil_in_list)
vector_cycle_dict = {}
with tqdm(total=total_len, ncols=100, desc= " Stil Scanning in RAM Progress") as pbar:
for i_iter in range(total_len):
line = stil_in_list[i_iter]
pbar.update(1)
if "=" in line:
vector_num +=1
if (vector_num in cycle_keys):
vector_cycle_dict[vector_num] = i_iter
status = line[line.find("=") + 1:line.find(";")]
# if cycle + self.offset in cycle_keys:
if vector_num in cycle_keys:
match = 1
for (i, j) in self.mask_dict[vector_num].iteritems():
mask_point = i
mask_signal = self.log_signal[i]
mask_value = j
test_point = self.signal_all[mask_signal]
test_value = status[test_point]
if test_value != mask_value:
print("data did not match for cycle: ", test_value, " VS ", line, j, vector_num, mask_point, mask_signal, test_point, test_value)
match = 0
raise NameError
else:
status = status[:test_point] + self.convert_value[self.mask_mode][test_value] + status[test_point + 1:]
if match == 1:
replace_line = line[:line.find("=") + 1] + status + line[line.find(";"):]
print("data change from :", line)
print(" to:", replace_line)
stil_in_list[i_iter] = replace_line
else:
print("No matching for %d with %s" %(vector_num, line))
raise NameError
with tqdm(total=len(stil_in_list), ncols=100, desc= " Masked-stil to in RAM Progress") as pbar:
with open(self.stil_name_new, 'w') as stil_out:
for new_line in range(len(stil_in_list)):
pbar.update(1)
stil_out.write(new_line)
I was expecting a solution that could finish in about 1 or 2 hours.
As I mentioned in the comments, you can get some speedup by refactoring your code to be multithreaded or multiprocess.
I imagine you're also running into memory swapping issues here. If that's the case, this should help:
with open(self.log_signal_file) as log_in:
line = log_in.readline() # First line. Need logic to handle empty logs
while line: #Will return false at EOF
if re.match(self.pattern, line):
match = re.match(self.pattern, line)
cycle = int(match.group(9))
signals = match.group(11)
# print cycle,signals
self.mask_dict[cycle] = {}
for i in range(len(signals)):
if signals[i] != '.':
self.mask_dict[cycle][i] = signals[i]
line = log_in.readline()
Here we only read in one line at a time, so you don't have to try to hold 8KK lines in memory
*In case anyone else didn't know, KK means million apparently.
I managed to optimized the solution, and the timing consumed tremendously reduced to about 1 minute.
Mainly the optimization is in below fields:
instead of keeping checking if (vector_num in cycle_keys):, I use
ordered list and always check whether equal to index_to_mask;
use variable line_find_equal and line_find_coma for further usage
class MaskStil:
def __init__(self):
self.log_signal_file = ''
self.pattern = r"^([^:]+)(:)(\d+)(\s+)(\d+)(\s+)(\d+)(\s+)(\d+)(\s)([.LH]+)$"
self.log_signal = {}
self.log_lines = []
self.mask_dict = {}
self.stil_name_new = ''
self.stil_name = ''
self.signal_all = {}
self.signal_group = []
self.offset = 0
self.mask_mode = -1 # mask_mode 0: revert between L/H; mask_mode 1: mask L/H to Z
self.convert_value=[{"L":"H", "H":"L"}, {"L":"Z", "H":"Z"}]
for i in range(100):
self.log_signal[i] = ''
def digest(self, log_signal, stil_file, signal_group, offset, mask_mode = 1):
self.log_signal_file = log_signal
self.stil_name = stil_file
self.stil_name_new = stil_file[:-5] + '_mask.stil'
self.signal_group = signal_group.replace('=', '+').strip().split('+')
self.offset = offset
self.mask_mode = mask_mode
for i in range(1, len(self.signal_group)):
self.signal_all[self.signal_group[i]] = int(math.floor((i - 1) / 10) + i)
print(self.signal_all)
self.parse_log()
self.stil_parse()
def parse_log(self):
with open(self.log_signal_file) as infile:
line_num = 0
blank_line = 0
for line in infile:
line_num += 1
if line_num == 1:
blank_line = line.count(' ')
if "------------------" in line:
break
for i in range(blank_line, len(line)):
self.log_signal[i - blank_line] += line[i]
for (key, value) in self.log_signal.items():
self.log_signal[key] = value.rstrip()
print(self.log_signal)
with open(self.log_signal_file) as log_in:
self.log_lines = log_in.read().splitlines()
for line in self.log_lines:
if re.match(self.pattern, line):
match = re.match(self.pattern, line)
cycle = int(match.group(9))
signals = match.group(11)
# print cycle,signals
self.mask_dict[cycle] = {}
for i in range(len(signals)):
if signals[i] != '.':
self.mask_dict[cycle][i] = signals[i]
def stil_parse(self):
cycle_keys = []
vector_num = 0
for i in self.mask_dict.keys():
cycle_keys.append(i)
with open(self.stil_name, 'r') as stil_in:
stil_in_list = stil_in.read().splitlines()
total_len = len(stil_in_list)
index_to_mask = 0
with tqdm(total=total_len, ncols=100, desc= " Stil Scanning in RAM Progress") as pbar:
for i_iter in range(total_len):
line = stil_in_list[i_iter]
pbar.update(1)
if "=" in line:
vector_num +=1
if (vector_num<=cycle_keys[-1]):
if (vector_num == cycle_keys[index_to_mask]):
line_find_equal = line.find("=")
line_find_coma = line.find(";")
status = line[line_find_equal + 1:line_find_coma]
# if cycle + self.offset in cycle_keys:
try:
match = 1
for (i, j) in self.mask_dict[vector_num].items():
mask_point = i
mask_signal = self.log_signal[i]
mask_value = j
test_point = self.signal_all[mask_signal]
test_value = status[test_point]
if test_value != mask_value:
print("data did not match for cycle: ", test_value, " VS ", line, j, vector_num, mask_point, mask_signal, test_point, test_value)
match = 0
raise NameError
else:
status = status[:test_point] + self.convert_value[self.mask_mode][test_value] + status[test_point + 1:]
stil_in_list[i_iter] = line[:line_find_equal + 1] + status + line[line_find_coma:]
# print("data change from :", line)
# print(" to:", stil_in_list[i_iter])
index_to_mask = index_to_mask+1
except (Exception) as e:
print("No matching for %d with %s" %(vector_num, line))
raise NameError
with tqdm(total=len(stil_in_list), ncols=100, desc= " Masked-stil to disk Progress") as pbar:
with open(self.stil_name_new, 'w') as stil_out:
for i_iter in range(len(stil_in_list)):
pbar.update(1)
stil_out.write(stil_in_list[i_iter]+ "\n")
import os,re
import math
from math import log10
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
python_file_root = './presidential_debates'
def getidf(token):
document_occurance = 0
for filename in os.listdir(python_file_root):
file = open(os.path.join(python_file_root, filename), "r")
for line in file:
if re.search(r'\b' +token+ r'\b', line):
document_occurance = document_occurance + 1
break
if (document_occurance != 0):
idf = log10(30 / document_occurance)
return idf
return -1
def normalize(filename,token):
file = open(os.path.join(python_file_root, filename), "r")
counts = dict()
square = []
count1 = 0
for line in file:
count1 = count1 + 1
if line in counts:
counts[line] += 1
else:
counts[line] = 1
for key,value in counts.items():
tf = 1 +log10(value)
idf = getidf(key.rstrip())
square.append((tf * idf)*(tf * idf))
summ = sum(square)
sqroot = math.sqrt(summ)
return sqroot
def getweight(filename,token):
hit_count1 = 0
final = 0
file = open(os.path.join(python_file_root, filename), "r")
idft = getidf(token)
for line in file:
if re.search(r'\b' +token+ r'\b', line):
hit_count1 = hit_count1 + 1
if (hit_count1 == 0):
return 0
else:
tf = 1 + log10(hit_count1)
initial = idft * tf
if(initial <= 0):
final = 0
return final
else:
normalize_fact = normalize(filename,token)
final = initial / normalize_fact
return final
for filename in os.listdir(python_file_root):
file = open(os.path.join(python_file_root, filename), "r")
doc = file.read()
doc = doc.lower()
stemmed = []
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
tokens = tokenizer.tokenize(doc)
stoplist = stopwords.words('english')
stop_removed = [word for word in tokens if word not in stoplist]
with open(os.path.join(python_file_root, filename), "w") as f:
for item in stop_removed:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(item)]
for items in stemmed:
f.write("%s\n" % items)
print("\nIDF\n")
print("%.12f" % getidf("health"))
print("%.12f" % getidf("agenda"))
print("%.12f" % getidf("vector"))
print("%.12f" % getidf("reason"))
print("%.12f" % getidf("hispan"))
print("%.12f" % getidf("hispanic"))
print("\n")
print("%.12f" % getweight("2012-10-03.txt","health"))
print("%.12f" % getweight("1960-10-21.txt","reason"))
print("%.12f" % getweight("1976-10-22.txt","agenda"))
print("%.12f" % getweight("2012-10-16.txt","hispan"))
print("%.12f" % getweight("2012-10-16.txt","hispanic"))
I have 30 txt files and i have developed a program to find the idf and normalized tf-idf vectors. Im getting the correct values but the function getweight takes more than 15 minutes to generate the output. Can anyone suggest me a few methods for optimization.
I donot want to use any other non-standard Python package.
Why do you create a new PorterStemmer for every word?
Apart from this obvious thing, try profiling your code. NLTI has the reputation of being really slow - so it may well be not your fault. If you profile, then you'll know.
thanks for the answer before and I have changed it what Alperen suggested, but I have another problem, my code :
import sys
import os
import itertools
import os.path
import random
from PIL import Image
from svmutil import *
DIMENSION = 200
sys.path.append("../train/")
ROOT_DIR = os.path.dirname(os.getcwd()) + "/train"
NEGATIVE = "negative"
POSITIVE = "positive"
CLASSES = [NEGATIVE, POSITIVE]
# libsvm constants
LINEAR = 0
RBF = 2
# Other
USE_LINEAR = False
IS_TUNING = False
def main():
try:
train, tune, test = getData(IS_TUNING)
models = getModels(train)
results = None
if IS_TUNING:
print ("!!! TUNING MODE !!!")
results = classify(models, tune)
else:
results = classify(models, test)
print
totalCount = 0
totalCorrect = 0
for clazz in CLASSES:
count, correct = results[clazz]
totalCount += count
totalCorrect += correct
print ("%s %d %d %f") % (clazz, correct, count, (float(correct) / count))
print ("%s %d %d %f") % ("Overall", totalCorrect, totalCount,(float(totalCorrect) / totalCount))
except Exception as e:
print (e)
return 5
def classify(models, dataSet):
results = {}
for trueClazz in CLASSES:
count = 0
correct = 0
for item in dataSet[trueClazz]:
predClazz, prob = predict(models, item)
print ("%s,%s,%f") % (trueClazz, predClazz, prob)
count += 1
if trueClazz == predClazz: correct += 1
results[trueClazz] = (count, correct)
return results
def predict(models, item):
maxProb = 0.0
bestClass = ""
for clazz, model in models.iteritems():
prob = predictSingle(model, item)
if prob > maxProb:
maxProb = prob
bestClass = clazz
return (bestClass, maxProb)
def predictSingle(model, item):
output = svm_predict([0], [item], model, "-q -b 1")
prob = output[2][0][0]
return prob
def getModels(trainingData):
models = {}
param = getParam(USE_LINEAR)
for c in CLASSES:
labels, data = getTrainingData(trainingData, c)
prob = svm_problem(labels, data)
m = svm_train(prob, param)
models[c] = m
return models
def getTrainingData(trainingData, clazz):
labeledData = getLabeledDataVector(trainingData, clazz, 1)
negClasses = [c for c in CLASSES if not c == clazz]
for c in negClasses:
ld = getLabeledDataVector(trainingData, c, -1)
labeledData += ld
random.shuffle(labeledData)
unzipped = [list(t) for t in zip(*labeledData)]
labels, data = unzipped[0], unzipped[1]
return (labels, data)
def getParam(linear = True):
param = svm_parameter("-q")
param.probability = 1
if(linear):
param.kernel_type = LINEAR
param.C = .01
else:
param.kernel_type = RBF
param.C = .01
param.gamma = .00000001
return param
def getLabeledDataVector(dataset, clazz, label):
data = dataset[clazz]
labels = [label] * len(data)
output = zip(labels, data)
return output
def getData(generateTuningData):
trainingData = {}
tuneData = {}
testData = {}
for clazz in CLASSES:
(train, tune, test) = buildTrainTestVectors(buildImageList(ROOT_DIR + clazz + "/"), generateTuningData)
trainingData[clazz] = train
tuneData[clazz] = tune
testData[clazz] = test
return (trainingData, tuneData, testData)
def buildImageList(dirName):
imgs = [Image.open(dirName + fileName).resize((DIMENSION, DIMENSION)) for fileName in os.listdir(dirName)]
imgs = [list(itertools.chain.from_iterable(img.getdata())) for img in imgs]
return imgs
def buildTrainTestVectors(imgs, generateTuningData):
# 70% for training, 30% for test.
testSplit = int(.7 * len(imgs))
baseTraining = imgs[:testSplit]
test = imgs[testSplit:]
training = None
tuning = None
if generateTuningData:
# 50% of training for true training, 50% for tuning.
tuneSplit = int(.5 * len(baseTraining))
training = baseTraining[:tuneSplit]
tuning = baseTraining[tuneSplit:]
else:
training = baseTraining
return (training, tuning, test)
if __name__ == "__main__":
sys.exit(main())
and I got the new massage
Klik this massage to see new error massage
What should I do? I have searched every answer but never make me get the answer. Now I use this code for my final project at university. I hope anyone can help me for this problem. But thank you for another last answer
EDIT:
This lines causes the error:
labeledData += ld
+= operand doesn't work for zips. You can change zips to list.
def getLabeledDataVector(dataset, clazz, label):
...
return list(output)
Also, unzipped list can be empty, you should fix this line too(Thanks to ShadowRanger for comment):
labels, data = unzipped if unzipped else ([], [])
This changes probably will affect your code's logic. You should fix them on your own.
BEFORE EDIT:
In getData(generateTuningData) function, ROOT_DIR + clazz expression causes the error, because ROOT_DIR is None.
sys.path.append doesn't return anything(returns None).
You need to change your code as:
...
import os.path
...
sys.path.append("../train/")
ROOT_DIR = os.path.dirname(os.getcwd()) + "/train/" # parent directory and "/train/"
...
I assumed ROOT_DIR is your current working directory's parent + "/train/". If it is not, you can fix it.
Also, there may be other problems, but this solves unsupported operand type(s).
The Code Below I wrote takes input from a sample file which contains First and Last names. Then it converts those names to sample emails. For some reason the Script keeps printing the same Last name over and over.
namess.txt looks like this:
firstname,lastname
CODE:
import os, re, time, getpass, linecache
Original = os.path.join(os.path.expanduser('~'), 'Desktop','namess.txt')
File = os.path.join(os.path.expanduser('~'), 'Desktop','output.txt')
badNames = []
Names = []
def RemCommas():
outfile = open(os.path.join('C:\\', 'Users', getpass.getuser(), 'Desktop','output.txt'),'w')
Filedata = open(Original).read()
outfile.write(re.sub(',', ' ', Filedata))
outfile.close()
def ClassNum():
count = 6
Year = int(time.strftime('%Y'))
Class = str((Year - 2013) + 6)
return Class
def ReadStoreFile():
i = 0
OpenFile = open(File)
LenFile = len(OpenFile.readlines())
while i < LenFile:
i += 1
badNames.append(linecache.getline(File, i))
def CleanNames():
i = 0
while i < len(badNames):
cleaned = badNames[i].rstrip()
Names.append(cleaned)
i += 1
def NamePrint():
Interns = 'makchessclub.org'
arrayname = []
i = 0
j = 0
m = 0
while m < len(Names):
Name = Names[m]
Name = Name.lower()
InternName = Name[0] + Name[1]
#------------Checking for space and first name--
while i < len(Name):
if Name[i] == ' ':
i = Name.index(' ')
break;
i += 1
#---------------adding last name in an array----
Namelen = len(Name) - (i+1)
while j < Namelen:
arrayname.append(Name[i+1])
j += 1
i += 1
#---------------Final Name Print----------------
Lastname = ''.join(arrayname)
#print arrayname
#Lastname = Lastname.strip(' ')
#print InternName + Lastname + ClassNum() + Interns
file = open('C:\\Users\\username\\Desktop\\emails.txt', 'a')
file.write(InternName + Lastname + ClassNum() + Interns + '\n')
file.close()
m += 1
RemCommas()
ReadStoreFile()
CleanNames()
NamePrint()
print ''
os.system('pause')
The reason the last name doesn't change is because you are not resetting arrayname in your loop. You keep appending names to it, and the program picks the first one. So you should put your arrayname = [] after the while m < len(Names):
I guess this what you are trying to do:
import os
import re
import time
def create_mails(input_path, output_path, year, addr):
with open(input_path, 'r') as data:
mail = re.sub(r'(\w+)\s*,\s*(\w+)\n?', r'\1\g<2>%s%s\n' % (year, addr), data.read())
with open(output_path, 'w') as output:
output.write(mail.lower())
print 'Mail addresses generated and saved to', output_path
Demo:
create_mails(
os.path.join(os.path.expanduser('~'), 'Desktop', 'namess.txt'),
os.path.join(os.path.expanduser('~'), 'Desktop', 'output.txt'),
str(int(time.strftime('%Y')) - 2013 + 6),
'#makchessclub.org'
)
If namess.txt is something like this:
First, Last
John,Doe
Spam, Ham
Cabbage, egg
Then output.txt is going to be like this:
firstlast6#makchessclub.org
johndoe6#makchessclub.org
spamham6#makchessclub.org
cabbageegg6#makchessclub.org