Text file indexing using python 3.4.3 - python
I try to write a Python 3.4 code to index text document from external
and this my attempt. when run it error message:
raw input is not defined
What I want is:
to tokenize the document which is out of python 34 folder
to remove stop words
to stem
indexing
The code:
import string
def RemovePunc():
line = []
i = 0
text_input = ""
total_text_input = "C:Users\Kelil\Desktop\IRS_Assignment\project.txt"
#This part removes the punctuation and converts input text to lowercase
while i != 1:
text_input = raw_input
if text_input == ".":
i = 1
else:
new_char_string = ""
for char in text_input:
if char in string.punctuation:
char = " "
new_char_string = new_char_string + char
line = line + [new_char_string.lower()]
#This is a list with all of the text that was entered in
total_text_input = (total_text_input + new_char_string).lower()
return line
def RemoveStopWords(line):
line_stop_words = []
stop_words = ['a','able','about','across','after','all','almost','also','am','among',
'an','and','any','are','as','at','be','because','been','but','by','can',
'cannot','could','dear','did','do','does','either','else','ever','every',
'for','from','get','got','had','has','have','he','her','hers','him','his',
'how','however','i','if','in','into','is','it','its','just','least','let',
'like','likely','may','me','might','most','must','my','neither','no','nor',
'not','of','off','often','on','only','or','other','our','own','rather','said',
'say','says','she','should','since','so','some','than','that','the','their',
'them','then','there','these','they','this','tis','to','too','twas','us',
'wants','was','we','were','what','when','where','which','while','who',
'whom', 'why', 'will', 'with', 'would', 'yet', 'you', 'your']
#this part removes the stop words for the list of inputs
line_stop_words = []
sent = ""
word = ""
test = []
for sent in line:
word_list = string.split(sent)
new_string = ""
for word in word_list:
if word not in stop_words:
new_string = new_string + word + " "
new_string = string.split(new_string)
line_stop_words = line_stop_words + [new_string]
return(line_stop_words)
def StemWords(line_stop_words):
leaf_words = "s","es","ed","er","ly","ing"
i=0
while i < 6:
count = 0
length = len(leaf_words[i])
while count < len(line_stop_words):
line = line_stop_words[count]
count2 = 0
while count2 < len(line):
#line is the particular list(or line) that we are dealing with, count if the specific word
if leaf_words[i] == line[count2][-length:]:
line[count2] = line[count2][:-length]
count2 = count2 + 1
line_stop_words[count] = line
count2 = 0
count = count + 1
count = 0
i = i + 1
return(line_stop_words)
def indexDupe(lineCount,occur):
if str(lineCount) in occur:
return True
else:
return False
def Indexing(line_stop_words):
line_limit = len(line_stop_words)
index = []
line_count = 0
while line_count < line_limit:
for x in line_stop_words[line_count]:
count = 0
while count <= len(index):
if count == len(index):
index = index + [[x,[str(line_count+1)]]]
break
else:
if x == index[count][0]:
if indexDupe(line_count+1,index[count][1]) == False:
index[count][1] += str(line_count+1)
break
count = count + 1
line_count = line_count + 1
return(index)
def OutputIndex(index):
print ("Index:")
count = 0
indexLength = len(index)
while count < indexLength:
print (index[count][0],)
count2 = 0
lineOccur = len(index[count][1])
while count2 < lineOccur:
print (index[count][1][count2],)
if count2 == lineOccur -1:
print ("")
break
else:
print (",",)
count2 += 1
count += 1
line = RemovePunc()
line_stop_words = RemoveStopWords(line)
line_stop_words = StemWords(line_stop_words)
index = Indexing(line_stop_words)
OutputIndex(index)
#smichak already put the right answer in the comments. raw_input was renamed to input in Python 3. So you want:
text_input = input()
Don't forget those parentheses, since you want to call the function.
Related
How to replace an old value in a string with a new one?
So the problem states that if the given word in a sentence begins and ends with the same character I remove that character and keep on doing that until they are not the same anymore or their length is less than 3. Example: aabaa -> aba -> b And that is not the problem, so now I should replace the word 'aabaa' with 'b' in original string. The only problem i have is when the sentence is given without spaces. For example: Trust,is,all. -> rus,is,all. Also to note characters such as (. , ! ? ; :) are to be ignored but have to be there in the final output. So far I've wrote this but it doesn't satisfy the example above: s1 = str(input()) sentence = s1.split() news = [] ignorabelCharacters = ['.',',','!','?',';',':'] helpList = [] for i in range(len(s1)): if s1[i] in ignorabelCharacters: helpList.append([s1[i],i]) for i in sentence: i = str(i) j = 0 while j < len(i): if i[j] in ignorabelCharacters: i = i.replace(i[j],' ').strip() j+=1 else:j+=1 news.append(i) s2 =' '.join(news) newSentence = s2.split() def checkAgain(newSentence): newNewSentance = [] count = [] x=0 for i in newSentence: j = len(i) while j > 2: if i[0].lower() == i[-1] or i[0].upper() == i[-1]: i = i[1:-1] j-=2 x+=2 else: break count.append(x) newNewSentance.append(i) x=0 return [newNewSentance,count] newNewSentence = checkAgain(newSentence)[0] count = checkAgain(newSentence)[1] def finalProcessing(newNewSentence,sentence,newSentence): finalSentence = [] for i in range(len(sentence)): if len(newNewSentence[i]) == len(sentence[i]): finalSentence.append(newNewSentence[i]) else: x = len(sentence[i]) - len(newSentence[i]) if x ==0: finalSentence.append(newNewSentence[i]) else: value = newNewSentence[i] + sentence[i][-x:] finalSentence.append(value) return finalSentence finalSentence = finalProcessing(newNewSentence,sentence,newSentence) def finalPrint(finalSentece): for i in range(len(finalSentece)): if i == len(finalSentece) -1: print(finalSentece[i],end='') else: print(finalSentece[i],end= ' ') finalPrint(finalSentence)
This approach is different from yours but here is how I would do it. def condenseWord(w): ln = len(w) while ln >= 3: if w[0].lower() == w[-1].lower(): w = w[1:-1] ln = len(w) else: break return w def collapseString(s): sep_chars = {' ', ',', '.', ';', ':', '!', '"', "'"} out = r'' wdstrt = 0 lstltr = 0 for i in range(len(s)): if s[i] in sep_chars: ck = condenseWord(s[wdstrt:i]) out += ck out += s[i] wdstrt = i+1 lstltr = i+1 else: lstltr = i out += s[wdstrt:lstltr] return out Then collapseString('aabaa') -> 'b', and collapseString('Trust,is,all.' ) -> 'rus,is,all.'
Trying to reverse letter order in words with 5 or more letters in a string. This code does not crash but does not output anything either
def spin(s): for word in s: if len(word) >= 5: w = ' '.join(w[::-1] for w in s.split()) return w print(w) s = "Twist and shout" spin(s) desired output: "tsiwT and tuohs"
def spin(s): return ' '.join(word[::-1] if len(word) >= 5 else word for word in s.split()) print(spin("Twist and shout"))
def reverse(x): x_list = list(s.split(" ")) final_list = [] for word in x_list: if len(word)>=5: word = word[::-1] final_list.append(word) else: final_list.append(word) final_word = " ".join(final_list) return final_word
placeholders instead of split operation. just to show use of placeholders string = "Twist and shout"; count = 0; temp = 0; array = []; for element in range(0, len(string)): count = count + 1; if string[element] is " ": array.append(string[temp:count-1]); if len(array[-1]) > 4: array[-1] = (array[-1][::-1]) temp = count; array.append(string[temp:count]); if len(array[-1]) > 4: array[-1] = (array[-1][::-1])
How do I make my decrypt code work?
I created this code that encrypts text but when it tries to decrypt I get a: builtins.ValueError: chr() arg not in range(0x110000) Any help in making the decryption code work properly would be much appreciated! input1 = input("enter key word: ") input2 = input1[:1] key = ord(input2) num_count = 32 dic= {} while num_count <= 126: resultant = float(num_count*key*565) dic[num_count] = resultant num_count += 1 string = input("Please enter text: ") num_list = "" for i in (string): number = int(ord(i)) value = (dic[number]) number_value = str(value) final_value = number_value+" " num_list += final_value print("Encrypted text is", num_list) num_count3 = 0 decrypt_text = "" string_len = len(num_list) characters = [] localChar = "" for i in num_list: if i != " ": localChar = localChar + i elif i == " ": characters.append(localChar) localChar = "" num_count3 = 0 list_len = len(characters) while num_count3 < list_len: value = float(characters[num_count3]) valuel = int(value/key/54734) value2 = round(value) de_char = chr(value2) decrypt_text += de_char num_count3 += 1 print(decrypt_text)
going to be honest, code was all over. But I hope this helps. Issue: num_count3 = 0 first instance not use string_len = len(num_list) not used int(value/key/54734) should be round(value/key/565) < your issue + value2 = round(value) should be value2 = int(valuel) And lots of clean up + Functions are great! def encrypt(key, input1): num_count = 32 dic = {i:float(i*key*565) for i in range(num_count,127)} string = input("Please enter text: ") num_list = ' '.join([str(dic[ord(i)]) for i in string]) print("Encrypted text is", num_list) return num_list def decrypt(key, num_list): characters = num_list.split() decrypt_text = "" num_count3 = 0 list_len = len(characters) while num_count3 < list_len: value = float(characters[num_count3]) valuel = (value/key/565) value2 = int(round(valuel)) de_char = chr(value2) decrypt_text += de_char num_count3 += 1 print(decrypt_text) if __name__ == "__main__": input1 = input("enter key word: ") key = ord(str(input1[:1])) print key result = encrypt(key, input1) decrypt(key, result)
How to count the amount of vowels and consonants in a text file?
I am trying to correctly count the number of vowels and consonants in a text file but I am lost currently. I have the other parts that need to be found done. # Home work 4 from string import punctuation fname = raw_input("Enter name of the file: ") fvar = open(fname, "r") punctuationList = "!#$%&'(),.:;?" numLines = 0 numWords = 0 numChars = 0 numPunc = 0 numVowl = 0 numCons = 0 if line in "aeiou": numVowl = + 1 else: numCons += 1 for line in fvar: wordsList = line.split() numLines += 1 numWords += len(wordsList) numChars += len(line) for punctuation in punctuationList: numPunc += 1 print "Lines %d" % numLines print "Words %d" % numWords print "The amount of charcters is %d" % numChars print "The amount of punctuation is %d" % numPunc print "The amount of vowls is %d" % numVowl print "The amount of consonants is %d" % numCons
You need to loop over all the characters in the line, testing whether they're vowels, consonants, or punctuation. for line in fvar: wordsList = line.split() numLines += 1 numWords += len(wordsList) numChars += len(line) for char in line: if char in 'aeiou': numVowl += 1 elif char in 'bcdfghjklmnpqrstvwxyz' numCons += 1 else: numPunc += 1
You can try this: f = [i.strip('\n').split() for i in open('file.txt')] new_lines = [[sum(b in 'bcdfghjklmnpqrstvwxyz' for b in i), sum(b in "aeiou" for b in i)] for i in f] total_consonants = sum(a for a, b in new_lines) total_vowels = sum(b for a, b in new_lines)
I would write a function that returns a 3-tuple of the counts you care about when given a string. import string def count_helper(s) -> ("vowel count", "consonant count", "punctuation count"): vowels = set('aeiou') consonants = set(string.ascii_lowercase).difference(vowels) # you could also do set('bcdfghjklmnpqrstvwxyz'), but I recommend this approach # because it's more obviously correct (you can't possibly typo and miss a letter) c_vowel = c_consonant = c_punctuation = 0 for ch in s: if ch in vowels: c_vowel += 1 elif ch in consonants: c_consonant += 1 else: c_punctuation += 1 return (c_vowel, c_consonant, c_punctuation) Then as you iterate through the file, pass each line to count_helper. counts = {'vowels': 0, 'consonants': 0, 'punctuation': 0} for line in f: v, c, p = count_helper(line) counts['vowels'] += v counts['consonants'] += c counts['punctuation'] += p
dynamically creating keys and values in a Python dictionary
The problem I am trying to solve is reading in a file in that contains a list of words. Then counting the number of vowels in each word and display each word in a table along with the number of its vowels and the total vowels in the word, and at the end display the total number of vowels in all of the words. I am trying to solve the problem by reading the file in through a for loop and creating a dictionary that is associated with every word like mississippi['a_count' : 0, 'e_ocunt' : 0, 'i_count' : 4 ,'o_count' : 0, 'u_count' : 0, 'y_count' : 0] My problem is that I am not sure how to create the dictionaries as the variable changes due to a loop. I am just ending up with empty dictionaries. here's a screenshot of my output http://imgur.com/mksgdTc my test code in the file is Mississippi California Wisconsin all on different lines. try: word_file = open("vowel.txt", "r") count = 0 dic = {} a_count = 0 e_count = 0 i_count = 0 o_count = 0 u_count = 0 y_count = 0 total_count = 0 #this establishes the top of the table print('Number','{:>8}'.format('word'),'{:>8}'.format('A'),'{:>4}'.format('E'),'{:>4}'.format('I'),'{:>4}'.format('O'),'{:>4}'.format('U'),'{:>4}'.format('Y'),'{:>8}'.format('Total')) print("__________________________________________________________") for word in word_file: count+=1 word = {} print(word) word_a_count = 0 word_e_count = 0 word_i_count = 0 word_o_count = 0 word_u_count = 0 word_y_count = 0 word_total_count = 0 for letters in word: print(letters) if letters.lower() == "a": a_count+= 1 total_count += 1 word_a_count +=1 word['a_count'] = word_a_count if letters.lower() == "e": e_count+= 1 total_count += 1 word_e_count +=1 word['e_count'] = word_e_count if letters.lower() == "i": i_count+= 1 total_count += 1 word_i_count +=1 word['i_count'] = word_i_count if letters.lower() == "o": o_count+= 1 total_count += 1 word_o_count +=1 word['o_count'] = word_o_count if letters.lower() == "u": u_count+= 1 total_count += 1 word_u_count +=1 word['u_count'] = word_u_count if letters.lower() == "y": y_count+= 1 total_count += 1 word_y_count +=1 word['y_count'] = word_y_count print('Totals','{:>8}'.format(' '),'{:>8}'.format(word['a_count']),'{:>4}'.format\ (word['e_count']),'{:>4}'.format(word['i_count']),'{:>4}'.format\ (word['o_count']),'{:>4}'.format(word['u_count']),'{:>4}'.\ format(word['y_count'])) #this creates the bottom barrier of the table print("__________________________________________________________") #code for totals print print('Totals','{:>8}'.format(' '),'{:>8}'.format(a_count),'{:>4}'.format(e_count),'{:>4}'.format(i_count),'{:>4}'.format(o_count),'{:>4}'.format(u_count),'{:>4}'.format(y_count),'{:>6}'.format(total_count)) except IOError: print("The file does not seem to exists. The program is halting.")
Focus on this section -- word is re-assigned as an empty dict on every iteration of the loop: for word in word_file: count+=1 word = {} However, commenting word = {} out now throws an error when the first vowel is read from file (since now the dict isn't empty). Remember that word is the current line in the text file that you are iterating over, so word['u_count'] = word_u_count is interpreted as an instruction to change a character in the string. Python strings are immutable, so an error is thrown. Your program is much longer than it needs to be - when you notice repetition in your code consider refactoring to take advantage of loops and iteration, to make your program more concise. You could separate all the logic for counting the letters in a word into one procedure: def countletters(word, letterstocount): count = {} word = word.lower() for char in word: if char in letterstocount: if char in count: count[char] += 1 else: count[char] = 1 return count #example call vowels = "aeiou" print(countletters('Arizona', vowels)) which you then call for each word in your file.
In Python 2 I'd do something like this... #! /usr/bin/env python ''' Count vowels in a list of words & show a grand total Words come from a plain text file with one word per line ''' import sys vowels = 'aeiouy' def make_count_dict(): ''' Create a dict for counting vowels with all values initialised to 0 ''' return dict(zip(vowels, (0,)*len(vowels))) def get_counts(d): return ' '.join('%2d' % d[k] for k in vowels) def count_vowels(wordlist): hline = '_'*45 print '%3s: %-20s: %s' % ('Num', 'Word', ' '.join('%2s' % v for v in vowels)) print hline total_counts = make_count_dict() for num, word in enumerate(wordlist, start=1): word_counts = make_count_dict() for ch in word.lower(): if ch in vowels: word_counts[ch] += 1 total_counts[ch] += 1 print '%3d: %-20s: %s' % (num, word, get_counts(word_counts)) print hline print '%-25s: %s' % ('Total', get_counts(total_counts)) def main(): fname = len(sys.argv) > 1 and sys.argv[1] if fname: try: with open(fname, 'r') as f: wordlist = f.read().splitlines() except IOError: print "Can't find file '%s'; aborting." % fname exit(1) else: wordlist = ['Mississippi', 'California', 'Wisconsin'] count_vowels(wordlist) if __name__ == '__main__': main()