Duplicate inserts to Database Table - python

My code works as intended: it scans a text file, retrieves user-defined information, and stores it in an Access database. The only issue I'm having is that when the code is run again, it inserts data that has already been inserted.
Is there an easy way to fix this? One thing all the records have in common is a timestamp. Is it possible to use this to prevent duplicates?
cur.execute("SELECT * FROM Main_Setup order by ID");
rows = cur.fetchall()
# outFileName = "out4.txt"
# The regex pattern that is used to extract timestamp from file
# it will search for timestamps like this 2017-06-13-22.31.30.978293
dateRegEx_1 = r"[0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{2}\.[0-9]{2}\.[0-9]{2}\.[0-9]+"
dateRegEx_2 = r"[0-9]{4}-[0-9]{2}-[0-9]{2} \/ [0-9]+:[0-9]+:[0-9]+"
# Compile the pattern
regdExPtrn_1 = re.compile(dateRegEx_1)
regdExPtrn_2 = re.compile(dateRegEx_2)
field_names = ''
# Call insertToAccess function to insert into access database
cur.execute('SELECT * FROM lastran order by ID')
tlran = cur.fetchall()
def insertLastran(JobName,timeStamp):
print(JobName,timeStamp)
def insertToAccess(JobName, TableSeq, timeStamp, accessTableValues, field_names):
# try:
params = (JobName, timeStamp, TableSeq, accessTableValues[0], accessTableValues[1], accessTableValues[2],
accessTableValues[3], accessTableValues[4], \
accessTableValues[5], accessTableValues[6], accessTableValues[7], accessTableValues[8],
accessTableValues[9], field_names)
cur.execute("INSERT INTO Report_Table (Job_Name,Run_TS,Seq_Num,Field1,Field2,Field3,Field4,Field5,Field6,Field7,Field8,Field9,Field10,Field11) VALUES \
(?,?,?,?,?,?,?,?,?,?,?,?,?,?)", params);
conn.commit()
# except:
# conn.rollback()
# Extract the current job fields
def field_Extract(fileLines, fieldsArray, JobName, timeStamp, delimit):
# Empty string in which we will append the
# extracted fields
matchStr = ""
count = 0
TableSeq = 0
accessTableValues = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
field_names = ''
for line in fileLines:
for field in fieldsArray:
if field in line:
key, value = line.split(delimit)
matchStr += key.strip() + "\t\t : " + value.strip() + "\n"
accessTableValues[count] = value.strip()
field_names += key.strip() + ';'
count += 1
if count == 10:
TableSeq += 1
insertToAccess(JobName, TableSeq, timeStamp, accessTableValues, field_names)
count = 0
accessTableValues = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
field_names = ''
if count > 0 and count < 10:
TableSeq += 1
insertToAccess(JobName, TableSeq, timeStamp, accessTableValues, field_names)
# Return the string hold the extracted fields
# Each field is onn a separate line
return matchStr
# Open input and output files
test_file = open(r'C:\Users\cqt7wny\Desktop\new\SAVERS_REPT_DT0712.txt', 'r+')
# outFile = open(outFileName, 'w')
# Initialize used variables
currentJobData = []
startAppending = False
currentJobFound = False # Gaurav note
fields_To_Extract = []
outFileStr = ""
for line in test_file:
if startAppending == False:
# for jobStart in job_start:
if currentJobFound == False:
# Find the job name for the current report and exit the loop #====##########===== Gaurav note
for rowx in rows:
if rowx[1] in line:
currentJobName = rowx[1]
search_Start_Point = rowx[2]
search_End_Point = rowx[3]
fields_To_Extract = rowx[4].split(';')
currentJobFound = True
break
if currentJobName == 'xx':
currentJobName = previousJobName
search_Start_Point = previous_search_Start_Point
search_End_Point = previous_search_End_Point
fields_To_Extract = previous_fields_To_Extract
if search_Start_Point in line:
startAppending = True
if startAppending == True:
currentJobData.append(line)
if len(search_End_Point) > 1 and (search_End_Point in line):
# As a job end found, stop gathering lines
startAppending = False
# Get the time stamp
# We search for it in the currnet line using the previously
# compiled regex pattern
txt = "".join(currentJobData)
# Find all occurance of timestamps on the current job lines
timeStamp = regdExPtrn_1.findall(txt)
# Check that a timestamp found
if len(timeStamp) >= 1:
# If there is more than one timestamp in the current
# job lines, get only the first one
timeStamp = timeStamp[0]
else:
timeStamp = regdExPtrn_2.findall(txt)
if len(timeStamp) >= 1:
timeStamp = timeStamp[0]
# Append the found output to the output string
outFileStr += '########============ NEW JOB STARTS HERE ===========#########'
outFileStr += "\n"
outFileStr += "job# " + str(currentJobName)
outFileStr += "\n"
outFileStr += "Timestamp: " + timeStamp
outFileStr += "\n"
outFileStr = field_Extract(currentJobData, fields_To_Extract, currentJobName, timeStamp, ':')
insertLastran(currentJobName,timeStamp)
print('Current job Name :', currentJobName, ' : ', timeStamp)
print(outFileStr)
previousJobName = currentJobName
previous_search_Start_Point = search_Start_Point
previous_search_End_Point = search_End_Point
previous_fields_To_Extract = fields_To_Extract
currentJobName = 'xx'
currentJobFound = False
currentJobData = []
fields_To_Extract = []
search_Start_Point = ' '
search_End_Point = ' '
test_file.close()

There are a couple of ways to prevent duplicate inserts:
1. Check whether the data is already present in the database table; insert it only if it does not exist, otherwise skip it.
2. Create a UNIQUE constraint on the column(s) where the duplication occurs, e.g.
ALTER TABLE MYTABLE ADD CONSTRAINT constraint1 UNIQUE(column1)
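For example, here is a minimal sketch of option 2 with pyodbc, assuming the cur/conn objects from the question and a unique constraint on (Job_Name, Run_TS, Seq_Num); treating that column combination as the identity of a run is an assumption, and insert_report_row is a hypothetical helper name:

import pyodbc

def insert_report_row(cur, conn, params):
    # Assumes: ALTER TABLE Report_Table ADD CONSTRAINT uq_run
    #          UNIQUE (Job_Name, Run_TS, Seq_Num)
    try:
        cur.execute("INSERT INTO Report_Table (Job_Name,Run_TS,Seq_Num,Field1,Field2,Field3,"
                    "Field4,Field5,Field6,Field7,Field8,Field9,Field10,Field11) "
                    "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)", params)
        conn.commit()
    except pyodbc.IntegrityError:
        # A row from a previous run already exists; skip it
        conn.rollback()

For option 1, since every record carries a timestamp, you could instead query the lastran table for the job's (name, timestamp) pair before processing it and skip the job if that pair is already recorded.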

Python - Write a new row for each list data under same header into csv

I have a text file, 'student.txt'. Some keys have multiple values. I only want data that is tied to the name, and the sibling & hobby values below that name.
'student.txt'
ignore me
name-> Alice
name-> Sam
sibling-> Kate,
unwanted
sibling-> Luke,
hobby_1-> football
hobby_2-> games
name-> Ramsay
hobby_1-> dance
unwanted data
hobby_2-> swimming
hobby_3-> jogging
ignore data
Code I've done:
file = open("student.txt", "r")
with open("student.csv", "w") as writer:
main_dict = {}
student_dict = {"Siblings": "N/A", "Hobbies": "N/A"}
sibling_list = []
hobby_list = []
flag = True
writer.write ('name,siblings,hobbies\n')
header = 'Name,Siblings,Hobbies'.split(',')
sib_str = ''
hob_str =''
for eachline in file:
try:
key, value = eachline.split("-> ")
value = value.strip(",\n")
if flag:
if key == "name":
print (key,value)
if len(sibling_list) > 0:
main_dict[name]["Siblings"] = sib_str
#print (main_dict)
if len(hobby_list) > 0:
main_dict[name]["Hobbies"] = hob_str
sibling_list = []
hobby_list = []
name = value
main_dict[name] = student_dict.copy()
main_dict[name]["Name"] = name
elif key == "sibling":
sibling_list.append(value)
sib_str= ' '.join(sibling_list).replace(' ', '\n')
elif key.startswith("hobby"):
hobby_list.append(value)
hob_str = ' '.join(hobby_list)
if len(sibling_list) > 0:
main_dict[name]["Siblings"] = sib_str
if len(hobby_list) > 0:
main_dict[name]["Hobbies"] = hob_str
if 'name' in eachline:
if 'name' in eachline:
flag = True
else:
flag = False
except:
pass
for eachname in main_dict.keys():
for eachkey in header:
writer.write(str(main_dict[eachname][eachkey]))
writer.write (',')
if 'Hobbies' in eachkey:
writer.write ('\n')
(The CSV output from the code above and the expected CSV output were shown as screenshots in the original post.)
P.S: I can't seem to figure out how to avoid the try/except with pass. Some lines (the ones without '->') are unwanted, so I can't just use eachline.split("-> ") on every line. Would appreciate help on this too.
Thanks so much!
The code below gives a csv file which you can import into Excel; it will be in exactly the format you are expecting.
You can use something like
if "->" not in line:
    continue
to skip lines that don't contain "->", as seen in the code below:
import csv

file = open("student.txt", "r")
students = {}
name = ""
for line in file:
    if "->" not in line:
        continue
    line = line.strip(",\n")
    line = line.replace(" ", "")
    key, value = line.split("->")
    if key == "name":
        name = value
        students[name] = {}
        students[name]["siblings"] = []
        students[name]["hobbies"] = []
    else:
        if "sibling" in key:
            students[name]["siblings"].append(value)
        elif "hobby" in key:
            students[name]["hobbies"].append(value)
#print(students)

csvlines = []
for student in students:
    name = student
    hobbies = students[name]["hobbies"]
    siblings = students[name]["siblings"]
    maxlength = 0
    if len(hobbies) > len(siblings):
        maxlength = len(hobbies)
    else:
        maxlength = len(siblings)
    if maxlength == 0:
        csvlines.append([name, "N/A", "N/A"])
        continue
    for i in range(maxlength):
        if i < len(siblings):
            siblingvalue = siblings[i]
        elif i == len(siblings):
            siblingvalue = "N/A"
        else:
            siblingvalue = ""
        if i < len(hobbies):
            hobbyvalue = hobbies[i]
        elif i == len(hobbies):  # "N/A" only for the first missing entry
            hobbyvalue = "N/A"
        else:
            hobbyvalue = ""
        if i == 0:
            csvlines.append([name, siblingvalue, hobbyvalue])
        else:
            csvlines.append(["", siblingvalue, hobbyvalue])
print(csvlines)

fields = ["name", "siblings", "hobbies"]
with open("students.csv", 'w') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
    # writing the data rows
    csvwriter.writerows(csvlines)
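One small note: in Python 3 the csv module recommends opening the output file with newline='', otherwise you can get blank rows between records on Windows:

with open("students.csv", "w", newline="") as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    csvwriter.writerows(csvlines)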

Why does it say "the action cannot be completed because the file is open in Python"?

def main_loop():
    global errname, errtime, error_detail, conclusion
    error_detail = ""
    facts_all = {}
    facts = []
    buffer = 0
    current_time = datetime.now()
    while os.path.exists("C:\Winusr"):
        print(paths["wintrace"])
        try:
            start_point = 0
            old_size = os.path.getsize(paths["wintrace"])
            while os.path.getsize(paths["wintrace"]) >= old_size:
                #fo = open(paths["wintrace"], "rb")
                #fo.seek(start_point,1)
                shutil.copyfile(paths["wintrace"], "C:\Winusr\wintrace1.log")
                fo = open("C:\Winusr\wintrace1.log", "rb")
                fo.seek(start_point, 1)
                errtime = datetime(1900, 1, 1)
                old_size = os.path.getsize(paths["wintrace"])
                # start from here
                for line in fo.readlines():
                    line = str(line.decode('ISO-8859-1'))
                    print(line)
                    if fnmatch.fnmatch(line, "*START DUMP LOG BUFFER*"):
                        buffer = 1
                    if fnmatch.fnmatch(line, "*END DUMP LOG BUFFER*"):
                        buffer = 0
                    if buffer == 1:
                        continue
                    facts_all = collect_facts(line, facts_all, key_string, key_value_dic)
                    for pattern in error_detect:
                        if fnmatch.fnmatch(line, pattern):
                            try:
                                err_type = df[df["Error Detect Keyword"] == pattern]["Err Type"].to_string(index=False).lstrip()
                                errname = df[df["Err Type"] == err_type]["Error Name"].tolist()[0].lstrip()
                                errtime = datetime.strptime(
                                    datetime.fromtimestamp(os.path.getmtime(paths["wintrace"])).strftime("%Y-%m-%d") + " " + line[:8],
                                    "%Y-%m-%d %H:%M:%S")  # "%d-%b-%Y %H:%M:%S"
                                #errtime = datetime.fromtimestamp(os.path.getmtime(paths["wintrace"])).strftime("%Y-%m-%d") + " " + line[:8]
                                #errtime = errtime.strftime('%Y-%m-%d %H:%M:%S')
                                product = re.findall(r"[/](.+?)[.]", paths["cur"])
                                product = product[0].split("/")[-1]
                                tester = tester_name(paths["cur"])
                                if len(facts_all) != 0:
                                    facts.append(errname)
                                    #idex = 9999
                                    for fact, line in facts_all.items():
                                        if fact in dic1[errname]:
                                            error_detail = error_detail + line + '\n'
                                            facts.append(fact)
                                    print("err_detail1", error_detail)
                                if len(facts) != 1:
                                    facts = list(set(facts))
                                conclusion = inference_engine(facts)
                                print("errtime", errtime)
                                print("current_time", current_time)
                                if conclusion != "cannot find solution for this error" and errtime > current_time:
                                    solutions = sop2(errlist, errname, conclusion)
                                    row = recording(tester, product, errname, errtime, error_detail, conclusion)
                                    print("gg pop out GUI!!!")
                                    #send_email(errname, errtime, tester, error_detail)
                                    GUI(errname, errtime, error_detail, conclusion, solutions, row)
                                    current_time = datetime.now()
                                    workbook = xlrd.open_workbook(r"G:\expert system data\Machine Database.xls")
                                    workbook1 = copy(workbook)
                                    ws1 = workbook1.get_sheet(0)
                                    style = xlwt.XFStyle()
                                    style.num_format_str = 'yyyy-mm-dd hh:mm:ss'
                                    ws1.write(row, 8, current_time, style)
                                    workbook1.save(r"G:\expert system data\Machine Database.xls")
                                    error_detail = ""
                                    facts_all = {}
                                    facts = []
                                error_detail = ""
                                facts_all = {}
                                facts = []
                            except:
                                continue
                start_point = fo.tell()
                fo.close()
        except:
            continue
    else:
        main_loop()
the paths["wintrace"] is ""C:\Winusr\Wintrace.log", i dont want it is open cause sometimes need to change its name or delete, i copy this file and open the copied one, but it still show it is open, can u help me check where it is opened? besides, i use "filepath = tkinter.filedialog.askopenfilename()", but dont think it will open the wintrace file.the error screenshot

Optimal substitute data structure to improve running time due to huge size of dictionary?

I have a Python script where I initialize a dictionary containing around 4.9 million keys. Each key has a list of 24 elements, which I initialize to zero. I need to parse a text file containing around 9.7 million lines (20 columns each) and, based on a specific match to a key of the dictionary, increment the appropriate list element of that key.
The problem is that the parsing is very slow and my job is getting killed (max 24 hr walltime on the cluster). The size of the initialized dictionary is around 200 MB, and after putting in some time checks I found that it takes around 16 minutes to parse 10,000 lines, so parsing the entire 9.7 million lines would take approximately 242 hours.
In short, I just need to count and increment the appropriate value of a dictionary key. Is there a substitute data structure for the Python dictionary that can optimize this script and make it run in a reasonable amount of time?
def count_dict_init(file):
    gff_file = open(file, 'r')
    pos_list = []
    for line in gff_file:
        line_list = line.strip().split('\t')
        if line.startswith('chr') and line[0:5] != 'chrmt':
            if line_list[2] == 'CDS':
                leftpos = int(line_list[3])
                rightpos = int(line_list[4])
                for position in range(leftpos - 100, rightpos + 101):
                    pos_list.append(position)
    uniq_list = set(pos_list)
    sorted_list = list(uniq_list)
    sorted_list.sort()
    pos_dict = {}
    for pos in sorted_list:
        pos_dict[pos] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '', '']
    print 'Size of count dictionary is ', sys.getsizeof(pos_dict)
    return pos_dict
def sam_parser(sam_file, count):
    dict_count = count
    parsed_file = open('Sam_parsed_dict.tab', 'w')
    non_cds_file = open('Non_Cds_file', 'w')
    for line in sam_file:
        if line[0] != '#':
            fields = line.split('\t')
            if len(fields) > 19:
                multi_flag = fields[19].strip()
                # If the read has more than one alignment then report it as multiple mapping
                if multi_flag != 'NH:i:1':
                    multi_align = 'Y'
                else:
                    multi_align = 'N'
            else:
                multi_align = 'N'
            non_cds = False
            sam_flag = int(fields[1])
            chr_num = fields[2]
            read_length = len(fields[9])
            pos_in_value = (read_length - 27) * 2  # Determines which list position to update
            if 27 <= read_length <= 37:
                if sam_flag == 0:  # Primary alignment on forward strand
                    five_prime = int(fields[3])
                    if five_prime in dict_count.keys():
                        dict_count[five_prime][pos_in_value] += 1
                        aligner_cis = dict_count[five_prime][22]
                        if aligner_cis == 'Y':
                            continue
                        else:
                            dict_count[five_prime][22] = multi_align
                    else:
                        non_cds = True
                if sam_flag == 16:  # On reverse strand
                    five_prime = int(fields[3]) + read_length - 1
                    if five_prime in dict_count.keys():
                        dict_count[five_prime][pos_in_value + 1] += 1
                        aligner_trans = dict_count[five_prime][23]
                        if aligner_trans == 'Y':
                            continue
                        else:
                            dict_count[five_prime][23] = multi_align
                    else:
                        non_cds = True
                if sam_flag == 256:  # Not primary alignment
                    five_prime = int(fields[3])
                    if five_prime in dict_count.keys():
                        aligner_cis = dict_count[five_prime][22]
                        if aligner_cis == 'Y':
                            continue
                        else:
                            dict_count[five_prime][22] = multi_align
                    else:
                        non_cds = True
                if sam_flag == 272:  # Not primary alignment and on reverse strand
                    five_prime = int(fields[3]) + read_length - 1
                    if five_prime in dict_count.keys():
                        aligner_trans = dict_count[five_prime][23]
                        if aligner_trans == 'Y':
                            continue
                        else:
                            dict_count[five_prime][23] = multi_align
                    else:
                        non_cds = True
                if non_cds:
                    non_cds_file.write(str(chr_num)+'\t'+str(fields[3])+'\n')
    for pos, counts in dict_count.iteritems():
        parsed_file.write(str(pos)+'\t'+'\t'.join(map(str, counts))+'\n')
    parsed_file.close()
    non_cds_file.close()

if __name__ == "__main__":
    # Parse arguments from commandline
    arguments = parse_arguments()
    GFF = arguments.gfffile
    chrnum = arguments.chrnum
    initial_count_dict = count_dict_init(GFF)
    SAM = open(arguments.inputPath)
    sam_parser(SAM, initial_count_dict)
I think your problem is this expression: if five_prime in dict_count.keys():
This creates a new list of every key in your dictionary (4.9M) and then walks through it linearly until the key is found (or it goes through the whole list if the key isn't found).
Since looking up a key in a dictionary takes 1 operation and looking it up in the list is 4.9M operations, you want to use this instead: if five_prime in dict_count:.
Another thing is that you are doing the lookups several more times than you need to. If the dictionary lookup is in any way a bottleneck, you can minimize it by doing the lookup only once per iteration. Here's some sample code:
five_prime = int(fields[3])
record = dict_count.get(five_prime)
if record is not None:
    record[pos_in_value] += 1
    aligner_cis = record[22]
    if aligner_cis == 'Y':
        continue
    else:
        record[22] = multi_align
else:
    non_cds = True
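The same single-lookup pattern applies to the other three sam_flag branches as well; for instance, the reverse-strand branch could become (a sketch reusing the names from the code above):

if sam_flag == 16:  # On reverse strand
    five_prime = int(fields[3]) + read_length - 1
    record = dict_count.get(five_prime)  # one hash lookup instead of several
    if record is not None:
        record[pos_in_value + 1] += 1
        if record[23] == 'Y':
            continue
        record[23] = multi_align
    else:
        non_cds = True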

Extract values from string

I want to extract certain values from a string in python.
snp_1_881627 AA=G;ALLELE=A;DAF_GLOBAL=0.473901;GENE_TRCOUNT_AFFECTED=1;GENE_TRCOUNT_TOTAL=1;SEVERE_GENE=ENSG00000188976;SEVERE_IMPACT=SYNONYMOUS_CODON;TR_AFFECTED=FULL;ANNOTATION_CLASS=REG_FEATURE,SYNONYMOUS_CODON,ACTIVE_CHROM,NC_TRANSCRIPT_VARIANT,NC_TRANSCRIPT_VARIANT;A_A_CHANGE=.,L,.,.,.;A_A_LENGTH=.,750,.,.,.;A_A_POS=.,615,.,.,.;CELL=GM12878,.,GM12878,.,.;CHROM_STATE=.,.,11,.,.;EXON_NUMBER=.,16/19,.,.,.;GENE_ID=.,ENSG00000188976,.,ENSG00000188976,ENSG00000188976;GENE_NAME=.,NOC2L,.,NOC2L,NOC2L;HGVS=.,c.1843N>T,.,n.3290N>T,n.699N>T;REG_ANNOTATION=H3K36me3,.,.,.,.;TR_BIOTYPE=.,PROTEIN_CODING,.,PROCESSED_TRANSCRIPT,PROCESSED_TRANSCRIPT;TR_ID=.,ENST00000327044,.,ENST00000477976,ENST00000483767;TR_LENGTH=.,2790,.,4201,1611;TR_POS=.,1893,.,3290,699;TR_STRAND=.,-1,.,-1,-1
Output:
GENE_ID GENE_NAME EXON_NUMBER SEVERE_IMPACT
snp_1_881627 ENSG00000188976 NOC2L 16/19 SYNONYMOUS_CODON
If the string has values for each of those variables (GENE_ID, GENE_NAME, EXON_NUMBER), output them; otherwise output "NA" (either the variable doesn't exist or its value doesn't exist). In some cases these variables don't exist in the string at all.
Which string method should I use to accomplish this? Should I split my string before extracting any values? I have 10k rows to extract values from, one per snp_*.
string = string.split(';')
P.S. I am a newbie in Python
There are two general strategies for this - split and regex.
To use split, first split off the row label (snp_1_881627):
rowname, data = row.split()
Then, you can split data into the individual entries using the ; separator:
data = data.split(';')
Since you need to get the value of certain keys, we can turn it into a dictionary:
dataDictionary = {}
for entry in data:
    entry = entry.split('=')
    dataDictionary[entry[0]] = entry[1] if len(entry) > 1 else None
Then you can simply check if the keys are in dataDictionary, and if so grab their values.
Using split is nice in that it will index everything in the data string, making it easy to grab whichever ones you need.
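For example, a short sketch of that final lookup, using the dataDictionary built above and the question's "NA" default:

keywords = ['GENE_ID', 'GENE_NAME', 'EXON_NUMBER', 'SEVERE_IMPACT']
# .get() covers missing keys; "or" also covers keys that had no value after "="
desired = {key: (dataDictionary.get(key) or 'NA') for key in keywords}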
If the ones you need will not change, then regex might be a better option:
>>> import re
>>> re.search('(?<=GENE_ID=)[^;]*', 'onevalue;GENE_ID=SOMETHING;othervalue').group()
'SOMETHING'
Here I'm using a "lookbehind" to match one of the keywords, then grabbing the value from the match using group(). Putting your keywords into a list, you could find all the values like this:
import re
...
keywords = ['GENE_ID', 'GENE_NAME', 'EXON_NUMBER', 'SEVERE_IMPACT']
desiredValues = {}
for keyword in keywords:
match = re.search('(?<={}=)[^;]*'.format(keyword), string_to_search)
desiredValues[keyword] = match.group() if match else DEFAULT_VALUE
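Note that for your data the matched value is itself a comma-separated list (e.g. GENE_ID matches .,ENSG00000188976,.,ENSG00000188976,ENSG00000188976), so you may still want to split it and keep, say, the first entry that isn't ".", along these lines (reusing DEFAULT_VALUE from above):

value = next((v for v in match.group().split(',') if v != '.'), DEFAULT_VALUE)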
I think this is going to be the solution you are looking for.
#input
user_in = 'snp_1_881627 AA=G;ALLELE=A;DAF_GLOBAL=0.473901;GENE_TRCOUNT_AFFECTED=1;GENE_TRCOUNT_TOTAL=1;SEVERE_GENE=ENSG00000188976;SEVERE_IMPACT=SYNONYMOUS_CODON;TR_AFFECTED=FULL;ANNOTATION_CLASS=REG_FEATURE,SYNONYMOUS_CODON,ACTIVE_CHROM,NC_TRANSCRIPT_VARIANT,NC_TRANSCRIPT_VARIANT;A_A_CHANGE=.,L,.,.,.;A_A_LENGTH=.,750,.,.,.;A_A_POS=.,615,.,.,.;CELL=GM12878,.,GM12878,.,.;CHROM_STATE=.,.,11,.,.;EXON_NUMBER=.,16/19,.,.,.;GENE_ID=.,ENSG00000188976,.,ENSG00000188976,ENSG00000188976;GENE_NAME=.,NOC2L,.,NOC2L,NOC2L;HGVS=.,c.1843N>T,.,n.3290N>T,n.699N>T;REG_ANNOTATION=H3K36me3,.,.,.,.;TR_BIOTYPE=.,PROTEIN_CODING,.,PROCESSED_TRANSCRIPT,PROCESSED_TRANSCRIPT;TR_ID=.,ENST00000327044,.,ENST00000477976,ENST00000483767;TR_LENGTH=.,2790,.,4201,1611;TR_POS=.,1893,.,3290,699;TR_STRAND=.,-1,.,-1,-1'
#set some empty vars
user_in = user_in.split(';')
final_output = ""
GENE_ID_FOUND = False
GENE_NAME_FOUND = False
EXON_NUMBER_FOUND = False
GENE_ID_OUTPUT = ''
GENE_NAME_OUTPUT = ''
EXON_NUMBER_OUTPUT = ''
SEVERE_IMPACT_OUTPUT = ''
for x in range(0, len(user_in)):
    if x == 0:
        first_line_count = 0
        first_line_print = ''
        while user_in[0][first_line_count] != " ":
            first_line_print += user_in[0][first_line_count]
            first_line_count += 1
        final_output += first_line_print + "\t"
    else:
        if user_in[x][0:11] == "SEVERE_GENE":
            GENE_ID_OUTPUT += user_in[x][12:] + "\t"
            GENE_ID_FOUND = True
        if user_in[x][0:9] == "GENE_NAME":
            GENE_NAME_OUTPUT += user_in[x][10:] + "\t"
            GENE_NAME_FOUND = True
        if user_in[x][0:11] == "EXON_NUMBER":
            EXON_NUMBER_OUTPUT += user_in[x][12:] + "\t"
            EXON_NUMBER_FOUND = True
        if user_in[x][0:13] == "SEVERE_IMPACT":
            SEVERE_IMPACT_OUTPUT += user_in[x][14:] + "\t"
if GENE_ID_FOUND == True:
    final_output += GENE_ID_OUTPUT
else:
    final_output += "NA"
if GENE_NAME_FOUND == True:
    final_output += GENE_NAME_OUTPUT
else:
    final_output += "NA"
if EXON_NUMBER_FOUND == True:
    final_output += EXON_NUMBER_OUTPUT
else:
    final_output += "NA"
final_output += SEVERE_IMPACT_OUTPUT
print(final_output)

Why is my Code Printing the same Last Name?

The code below takes input from a sample file containing first and last names, then converts those names to sample emails. For some reason the script keeps printing the same last name over and over.
namess.txt looks like this:
firstname,lastname
CODE:
import os, re, time, getpass, linecache

Original = os.path.join(os.path.expanduser('~'), 'Desktop', 'namess.txt')
File = os.path.join(os.path.expanduser('~'), 'Desktop', 'output.txt')
badNames = []
Names = []

def RemCommas():
    outfile = open(os.path.join('C:\\', 'Users', getpass.getuser(), 'Desktop', 'output.txt'), 'w')
    Filedata = open(Original).read()
    outfile.write(re.sub(',', ' ', Filedata))
    outfile.close()

def ClassNum():
    count = 6
    Year = int(time.strftime('%Y'))
    Class = str((Year - 2013) + 6)
    return Class

def ReadStoreFile():
    i = 0
    OpenFile = open(File)
    LenFile = len(OpenFile.readlines())
    while i < LenFile:
        i += 1
        badNames.append(linecache.getline(File, i))

def CleanNames():
    i = 0
    while i < len(badNames):
        cleaned = badNames[i].rstrip()
        Names.append(cleaned)
        i += 1

def NamePrint():
    Interns = 'makchessclub.org'
    arrayname = []
    i = 0
    j = 0
    m = 0
    while m < len(Names):
        Name = Names[m]
        Name = Name.lower()
        InternName = Name[0] + Name[1]
        #------------Checking for space and first name--
        while i < len(Name):
            if Name[i] == ' ':
                i = Name.index(' ')
                break
            i += 1
        #---------------adding last name in an array----
        Namelen = len(Name) - (i+1)
        while j < Namelen:
            arrayname.append(Name[i+1])
            j += 1
            i += 1
        #---------------Final Name Print----------------
        Lastname = ''.join(arrayname)
        #print arrayname
        #Lastname = Lastname.strip(' ')
        #print InternName + Lastname + ClassNum() + Interns
        file = open('C:\\Users\\username\\Desktop\\emails.txt', 'a')
        file.write(InternName + Lastname + ClassNum() + Interns + '\n')
        file.close()
        m += 1

RemCommas()
ReadStoreFile()
CleanNames()
NamePrint()
print ''
os.system('pause')
The reason the last name doesn't change is that you are not resetting arrayname in your loop. You keep appending characters to it, and the program keeps reusing what accumulated from the first name. You should put arrayname = [] right after while m < len(Names):
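A minimal sketch of the fixed loop structure (note that the i and j counters initialized before the loop appear to have the same problem and likely need resetting per name as well):

while m < len(Names):
    arrayname = []   # reset for every name
    i = 0            # reset the scan position as well
    j = 0
    Name = Names[m]
    # ... the rest of the per-name logic stays unchanged ...
    m += 1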
I guess this is what you are trying to do:
import os
import re
import time

def create_mails(input_path, output_path, year, addr):
    with open(input_path, 'r') as data:
        mail = re.sub(r'(\w+)\s*,\s*(\w+)\n?', r'\1\g<2>%s%s\n' % (year, addr), data.read())
    with open(output_path, 'w') as output:
        output.write(mail.lower())
    print 'Mail addresses generated and saved to', output_path
Demo:
create_mails(
    os.path.join(os.path.expanduser('~'), 'Desktop', 'namess.txt'),
    os.path.join(os.path.expanduser('~'), 'Desktop', 'output.txt'),
    str(int(time.strftime('%Y')) - 2013 + 6),
    '#makchessclub.org'
)
If namess.txt is something like this:
First, Last
John,Doe
Spam, Ham
Cabbage, egg
Then output.txt is going to be like this:
firstlast6#makchessclub.org
johndoe6#makchessclub.org
spamham6#makchessclub.org
cabbageegg6#makchessclub.org
