How to count letters in words without special characters - Python

I have code that counts letters in words, excluding special characters at the end. I just can't figure out a way to get it to exclude special characters at the beginning as well.
My code so far:
inFile = open('p.txt', "r").readlines()
myResults = []
for i in range(20):
    myResults.append(0)
mySpecialList = ['-', '+', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`']
for line in inFile:
    words = str.split(line)
    for word in words:
        if word not in mySpecialList:
            if word[-1] not in mySpecialList:
                myResults[len(word)] += 1
            else:
                myResults[len(word) - 1] += 1
print(myResults)

Here is some simple code to count all the alphanumeric characters of a single word.
word = "Hello World!"
count = 0
for c in word:
if c.isalnum():
count +1
print( count )
If you wanted to use your special characters, you could adapt the code to look like this:
mySpecialList = ['*', '!']
word = "Hello World!"
count = 0
for c in word:
    if c not in mySpecialList:
        count += 1
print(count)
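The same per-character idea can be plugged back into the tally loop from the question. A minimal sketch, assuming p.txt exists and reusing the question's mySpecialList (like the original, it assumes words stay under 20 characters):
mySpecialList = ['-', '+', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`']
myResults = [0] * 20
with open('p.txt') as inFile:
    for line in inFile:
        for word in line.split():
            # count only the characters that are not special,
            # so leading and trailing punctuation are both ignored
            length = sum(1 for c in word if c not in mySpecialList)
            if length > 0:  # skip tokens made up entirely of special characters
                myResults[length] += 1
print(myResults)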

You can use regular expressions, try it! For example, re.findall gives you a list of all the words in the string:
import re
string = "Hello World, Hi + Say"
print(re.findall(r"[\w']+", string))
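If the goal is still the letter counts from the original question, the findall result can feed the tally directly. A small sketch along those lines (the [\w']+ pattern is the one from this answer; note it also keeps digits and underscores):
import re

string = "Hello World, Hi + Say"
words = re.findall(r"[\w']+", string)
print([len(w) for w in words])  # [5, 5, 2, 3]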

def reverseword(user_input):
    words = str(user_input).split(" ")
    newWords = [word[::-1] for word in words]
    newSentence = " ".join(newWords)
    return newSentence

if __name__ == "__main__":
    while True:
        ispresent = 0
        splcharlist = ['-', '+', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`', " "]
        user_input = input("Enter the input:")
        print(len(user_input))
        ccount = 0
        new_input = ""
        ch_count = 0
        if len(user_input) > 100:
            for eletter in user_input:
                if eletter not in splcharlist:
                    ccount = ccount + 1
                ch_count = ch_count + 1
                if ccount > 100:
                    break
            new_input = user_input[:100]
        else:
            new_input = user_input
        print("This is for your input:", user_input)
        print("input with limit :" + str(new_input))
        print(len(new_input))
        print("The Reverse lists is: ", reverseword(new_input))
        if "stop" in user_input:
            break

Related

Transform a code tokens list into valid string code

I have written code to transform Python code into a list of tokens in order to compute a BLEU score:
import re

def tokenize_for_bleu_eval(code):
    code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
    code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
    code = re.sub(r'\s+', ' ', code)
    code = code.replace('"', '`')
    code = code.replace('\'', '`')
    tokens = [t for t in code.split(' ') if t]
    return tokens
Thanks to this snippet my code struct.unpack('h', pS[0:2]) is parsed properly into the list ['struct', '.', 'unpack', '(', 'h', ',', 'p', 'S', '[', '0', ':', '2', ']', ')'].
Initially, I thought I would simply need ' '.join(list_of_tokens), but that mangles my variable names, like struct . unpack ('h' , p S [ 0 : 2 ] ), and the code is not executable.
I tried to use a regex to glue some variable names back together, but I couldn't manage to reverse my function tokenize_for_bleu_eval to end up with executable code. Does anyone have an idea, perhaps without regex, which seems too complicated here?
EDIT: We can't just remove all the spaces between elements of the list, because there are examples like items = [item for item in container if item.attribute == value] where the result of the back-translation without spaces would be itemforiteminaifitem[0]==1, which is not valid.
I am trying to merge the tokens using this script:
import re

def tokenize_for_bleu_eval(code):
    code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
    code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
    code = re.sub(r'\s+', ' ', code)
    code = code.replace('"', '`')
    code = code.replace('\'', '`')
    tokens = [t for t in code.split(' ') if t]
    return tokens

def merge_tokens(tokens):
    code = ''.join(tokens)
    code = code.replace('`', "'")
    code = code.replace(',', ", ")
    return code

tokenize = tokenize_for_bleu_eval("struct.unpack('h', pS[0:2])")
print(tokenize)  # ['struct', '.', 'unpack', '(', '`', 'h', '`', ',', 'p', 'S', '[', '0', ':', '2', ']', ')']
merge_result = merge_tokens(tokenize)
print(merge_result)  # struct.unpack('h', pS[0:2])
Edit:
I found this interesting idea to tokenize and merge.
import re

def tokenize_for_bleu_eval(code):
    tokens_list = []
    codes = code.split(' ')
    for i in range(len(codes)):
        code = codes[i]
        code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
        code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
        code = re.sub(r'\s+', ' ', code)
        code = code.replace('"', '`')
        code = code.replace('\'', '`')
        tokens = [t for t in code.split(' ') if t]
        tokens_list.append(tokens)
        if i != len(codes) - 1:
            tokens_list.append([' '])
    flatten_list = []
    for tokens in tokens_list:
        for token in tokens:
            flatten_list.append(token)
    return flatten_list

def merge_tokens(flatten_list):
    code = ''.join(flatten_list)
    code = code.replace('`', "'")
    return code

test1 = "struct.unpack('h', pS[0:2])"
test2 = "items = [item for item in container if item.attribute == value]"

tokenize = tokenize_for_bleu_eval(test1)
print(tokenize)  # ['struct', '.', 'unpack', '(', '`', 'h', '`', ',', ' ', 'p', 'S', '[', '0', ':', '2', ']', ')']
merge_result = merge_tokens(tokenize)
print(merge_result)  # struct.unpack('h', pS[0:2])

tokenize = tokenize_for_bleu_eval(test2)
print(tokenize)  # ['items', ' ', '=', ' ', '[', 'item', ' ', 'for', ' ', 'item', ' ', 'in', ' ', 'container', ' ', 'if', ' ', 'item', '.', 'attribute', ' ', '=', '=', ' ', 'value', ']']
merge_result = merge_tokens(tokenize)
print(merge_result)  # items = [item for item in container if item.attribute == value]
This script also remembers each space from the input.
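As an alternative, if the snippets are themselves valid Python, the standard library's tokenize module pairs a tokenizer with an untokenizer, so the round trip comes for free. A minimal sketch, not the BLEU-oriented tokenizer above (spacing in the reconstruction may differ, but the code stays executable, and identifiers like pS are never split):
import io
import tokenize

def tokenize_code(code):
    # keep (type, string) pairs so the process can be reversed
    return [(tok.type, tok.string)
            for tok in tokenize.generate_tokens(io.StringIO(code).readline)]

pairs = tokenize_code("struct.unpack('h', pS[0:2])")
print([s for _, s in pairs if s.strip()])
# ['struct', '.', 'unpack', '(', "'h'", ',', 'pS', '[', '0', ':', '2', ']', ')']
print(tokenize.untokenize(pairs))  # executable, though whitespace may shift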

Concatenate some strings in a list

I have a list:
["i", "'", "ll", "get", "you", "in", "their", "call"]
And I want to convert it to:
["i'll", "get", "you", "in", "their", "call"]
So I need to concatenate words before and after apostrophes.
I tried to do it like this:
restored_text = ['i', "'", 'll', 'get', 'you', 'in', 'their', 'call']
restored_text_fixed = []
k = 0
i = 0
while i in (x for x in range(len(restored_text)) if k == 0):
    print(i)
    print('k', k)
    if k > 0:
        k = k - 1
    if restored_text[i+1] == "'" and i > 0:
        restored_text_fixed.append(restored_text[i] + "'" + restored_text[i+2])
        k = 2
    else:
        restored_text_fixed.append(restored_text[i])
    i += 1
But it seems that code doesn't work properly.
You can create an iterator from the list and concatenate the current item and the next item from the iterator with the previous item if the current item is an apostrophe:
restored_text = ["i", "'", "ll", "get", "you", "in", "their", "call"]
restored_text_fixed = []
i = iter(restored_text)
for s in i:
if s == "'":
restored_text_fixed[-1] += s + next(i)
else:
restored_text_fixed.append(s)
restored_text_fixed becomes:
["i'll", 'get', 'you', 'in', 'their', 'call']
Something that is a bit more in the flavour of the original post:
restored_text = ['i', "'", 'll', 'get', 'you', 'in', 'their', 'call']
restored_text_fixed = []
i = 0
while i < len(restored_text):
    # catch situations where there is no next element, otherwise the
    # next if clause will break the script
    if i == len(restored_text) - 1:
        restored_text_fixed.append(restored_text[i])
        break
    if restored_text[i+1] == "'":
        restored_text_fixed.append(restored_text[i] + "'" + restored_text[i+2])
        i += 3
    else:
        restored_text_fixed.append(restored_text[i])
        i += 1
print(restored_text_fixed)
Note that the other answers are considered a lot more Pythonic, and that this kind of looping over indices can usually be improved. Hopefully this fixes the bugs in your particular use case, though.
This might not work if you have a space (" ") in one of the elements, or an apostrophe between two spaces (" ' ") in an element, but if this list is the result of some tokenization process you could use this simple approach:
Just join the elements back with spaces, replace " ' " with "'", and split again.
a = ["i", "'", "ll", "get", "you", "in", "their", "call"]
" ".join(a).replace(" ' ", "'").split(" ")
> ["i'll", 'get', 'you', 'in', 'their', 'call']
If you do have spaces, but you are absolutely sure you don't have some other character (e.g. $), you can replace the spaces with $ first and then replace them back to spaces afterwards.
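A quick sketch of that placeholder trick, with a hypothetical element that contains an internal space:
a = ["new york", "'", "s", "subway"]
escaped = [w.replace(" ", "$") for w in a]            # protect the real spaces first
joined = " ".join(escaped).replace(" ' ", "'").split(" ")
print([w.replace("$", " ") for w in joined])          # ["new york's", 'subway']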
restored_text = ["i", "'", "ll", "get", "you", "in", "their", "call"]
#preparing the array "restored_text_fixed" to make expected number of elements
j = 0
for item in restored_text:
if item == "'":
j +=1
j = j * 2 # because each apostrophe took 3 elements from array, like ['i', "'", 'll'] total 3 elements, after it become 1 element ['i'll']
restored_text_fixed = ["0" for x in range(len(restored_text)-(j))]
#the processes above to allow us respect the position of elements from old array "restored_text" to new array "restored_text_fixed"
p = 0
for itemPos in range(len(restored_text)):
if itemPos < len(restored_text)-1 and restored_text[itemPos+1] == "'":
restored_text_fixed[p] = restored_text[itemPos]+""+ restored_text[itemPos+1]+""+restored_text[itemPos+2]
p += 1
elif restored_text[itemPos-1] != "'" and restored_text[itemPos] != "'":
restored_text_fixed[p] = restored_text[itemPos]
p += 1
print(restored_text_fixed)
#OUTPUT
#["i'll", 'get', 'you', 'in', 'their', 'call', "get'in"]
#NOTE
#restored_text = ['i', "'", 'll', 'get', 'you', 'in', 'their', 'call', 'get', "'", 'in'] # add more elements to test if can work from any pos. it work
You can use the following line:
>>> l = ["i", "'", "ll", "get", "you", "in", "their", "call"]
>>> "'".join(map(str.strip, ' '.join(l).split("'"))).split()
["i'll", 'get', 'you', 'in', 'their', 'call']
Breaking it down:
l = ' '.join(l)        # join words into a sentence => "i ' ll get you in their call"
l = l.split("'")       # split at the apostrophe => ['i ', ' ll get you in their call']
l = map(str.strip, l)  # strip off spaces => ['i', 'll get you in their call']
l = "'".join(l)        # add the apostrophe back => "i'll get you in their call"
l = l.split()          # split back into words => ["i'll", 'get', 'you', 'in', 'their', 'call']

How do I save specific text in an array from .txt file

I have a .txt file, from which I want to save only the following values: "N", "1.1", "XY", "N", "2.3", "XZ" in an array.
The .txt file looks like this:
[ TITLE
N 1.1 XY
N 2.3 XZ
]
Here is my code:
src = open("In.txt", "r")

def findOp(row):
    trig = False
    temp = ["", "", ""]
    i = 1
    n = 0
    for char in row:
        i += 1
        if (char != '\t') & (char != ' ') & (char != '\n'):
            trig = True
            temp[n] += char
        else:
            if trig:
                n += 1
                trig = False
    return temp

for line in src.readlines():
    print(findOp(line))
The Output from my code is:
['[', 'TITLE', '']
['', '', '']
['N', '1.1', 'XY']
['N', '2.3', 'XZ']
['', '', '']
[']', '', '']
The problem is that the program also saves whitespace-only entries in the array, which I don't want.
I would recommend the strip() functions, with which you can remove whitespace from a string:
Whitespace on both sides:
s = s.strip()
Whitespace on the right side:
s = s.rstrip()
Whitespace on the left side:
s = s.lstrip()
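For example, on one of the lines from the question's file (a small sketch; note that split() with no argument already discards all surrounding whitespace, so it pairs naturally with strip()):
line = "\tN  1.1  XY\n"
print(line.strip())  # N  1.1  XY
print(line.split())  # ['N', '1.1', 'XY']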
You could check the return array before exiting:
def findOp(row):
    trig = False
    temp = ["", "", ""]
    i = 1
    n = 0
    for char in row:
        i += 1
        if (char != '\t') & (char != ' ') & (char != '\n'):
            trig = True
            temp[n] += char
        else:
            if trig:
                n += 1
                trig = False
    # Will return `temp` if all elements eval to True, otherwise
    # it will return None
    return temp if all(temp) else None
The value None can then be used as a check condition in subsequent constructs:
for line in src.readlines():
    out = findOp(line)
    if out:
        print(out)
>> ['N', '1.1', 'XY']
>> ['N', '2.3', 'XZ']
Try numpy.genfromtxt:
import numpy as np
text_arr = np.genfromtxt('In.txt', skip_header = 1, skip_footer = 1, dtype = str)
print(text_arr)
Output:
[['N' '1.1' 'XY']
['N' '2.3' 'XZ']]
Or if you want a list, use text_arr.tolist().
Try this:
with open('In.txt', 'r') as f:
    lines = [i.strip() for i in f.readlines() if i.strip()][1:-1]
output = [[word for word in line.split() if word] for line in lines]
Output:
[['N', '1.1', 'XY'], ['N', '2.3', 'XZ']]

Tokenize Function Not Working As Expected - Python

These are the instructions:
Write a function tokenize(input_string) that takes a string containing an expression and returns a list of tokens. Tokens in this small language are delimited by whitespace, so any time there is a space (or several spaces in a row) in the input string, we want to split around it.
You should not use the built-in string operation split, but rather should structure your code using the tools we have developed so far.
When all is said and done, running the tokenizer on this string, for example:
tokenize("2 2 + 3 4 / .5 0.2 3.2 + - COS")
should return:
['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']
This is my code:
def tokenize(input_string):
    tokens = []
    token = ""
    for char in input_string:
        if char == " " and input_string[-1] != char and token != "":
            tokens.append(token)
            token = ""
        elif input_string[-1] == char:
            tokens.append(token + char)
        elif char != " ":
            token += char
    return tokens
My code works properly with the given example and similar arguments, but when I run something like:
tokenize("pi load store load")
i get:
['pi', 'load', 'loa', 'store', 'load']
What's the bug? I tried finding it with print statements in various parts of the function, to no avail. Also, any advice on how to better organize the if statements would be greatly appreciated. Thanks in advance for the help.
I think your flaw is in the line elif input_string[-1] == char:.
If I'm understanding you correctly, you are trying to use this elif case to check if you are at the end of the string, and if you are, to add the last token in the string to your list of tokens.
However, if you have the last character in your string appear more than once, it will go into this case every time; that's why you have both 'loa' and 'load' in your list.
My suggestion is to remove all of your checks for the current character being the same as the last character in the string, and add
if token != "":
    tokens.append(token)
after your for loop.
To add to Izaak Weiss's answer: simplify the logic of your checks. This could be a solution:
def tokenize(input_string):
    tokens = []
    token = ''
    for char in input_string:
        if char == ' ':  # Possible token termination
            if token != '':
                tokens.append(token)
                token = ''
        else:
            token += char
    # Last token
    if token != '':
        tokens.append(token)
    return tokens
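A quick check against the input that tripped up the original version, plus the example from the assignment:
print(tokenize("pi load store load"))
# ['pi', 'load', 'store', 'load']
print(tokenize("2 2 + 3 4 / .5 0.2 3.2 + - COS"))
# ['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']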
Here are 2 approaches:
A "plain" one that you were attempting to implement (tokenizing the string "manually")
A slightly more advanced one that uses str.find(sub[, start[, end]]) (and rfind)
Of course there are others as well (e.g. ones that use recursion, or even regular expressions), but they are probably too advanced.
def tokenize_plain(input_string):
    tokens = list()
    current_token = ""
    for char in input_string:
        if char == " ":
            if current_token:
                tokens.append(current_token)
                current_token = ""
        else:
            current_token += char
    if current_token:
        tokens.append(current_token)
    return tokens

def tokenize_find(input_string):
    tokens = list()
    start = 0
    end = input_string.find(" ", start)
    while end != -1:
        if end == start:
            start += 1
        else:
            tokens.append(input_string[start: end])
            start = end
        end = input_string.find(" ", start)
    end = input_string.rfind(" ", start)
    if end == -1:
        if start < len(input_string):  # avoid appending an empty token after a trailing space
            tokens.append(input_string[start:])
    else:
        tokens.append(input_string[start: end])
    return tokens

if __name__ == "__main__":
    for tokenize in [tokenize_plain, tokenize_find]:
        for text in ["pi load store load", "2 2 + 3 4 / .5 0.2 3.2 + - COS"]:
            print("{}('{}') = {}".format(tokenize.__name__, text, tokenize(text)))
Output:
c:\Work\Dev\StackOverflow\q46372240>c:\Work\Dev\VEnvs\py35x64_test\Scripts\python.exe a.py
tokenize_plain('pi load store load') = ['pi', 'load', 'store', 'load']
tokenize_plain('2 2 + 3 4 / .5 0.2 3.2 + - COS') = ['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']
tokenize_find('pi load store load') = ['pi', 'load', 'store', 'load']
tokenize_find('2 2 + 3 4 / .5 0.2 3.2 + - COS') = ['2', '2', '+', '3', '4', '/', '.5', '0.2', '3.2', '+', '-', 'COS']
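For completeness, the regular-expression variant mentioned above is nearly a one-liner (offered only as a comparison point; whether re counts as an allowed tool under the exercise's no-split rule is another matter):
import re

def tokenize_regex(input_string):
    # \S+ matches maximal runs of non-whitespace characters
    return re.findall(r"\S+", input_string)

print(tokenize_regex("pi load store load"))  # ['pi', 'load', 'store', 'load']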

Remove stopwords from sentences

I want to remove stopwords from a sentence.
I have this piece of code:
splitted = text.split()
for index, word in enumerate(splitted):
    if word in self.stopWords:
        del splitted[index]
text = " ".join(splitted)
stopWords is updated with the instruction self.stopWords.update(['.', ',', "\"", "\'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '),', '],', '},', '",', "',", '")', '"]', '"}', "-", "--", '\".', "\'.", '/', ').', '-', '--', '%', '°\'', '(-', '("', '."', '.),', ');', '–', '$', 'a']), but, for example, the letter 'a', as well as '.' or ';', isn't deleted from the sentence.
What should I do?
I think it's easier to use a list comprehension (or a generator expression, as I do here). Your original loop fails because deleting items from splitted while enumerating it shifts the remaining elements down, so the word right after each deleted one is skipped:
' '.join(w for w in text.split() if w not in stop_words)
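For example, with a stop-word set like the one in the question (a minimal, self-contained sketch; like the loop above, it only drops punctuation that stands alone as a token):
stop_words = {'a', '.', ',', ';', '?', '!'}
text = "this is a test , with a stopword ; inside"
print(' '.join(w for w in text.split() if w not in stop_words))
# this is test with stopword inside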
Can you try my code? If you have any questions about it, just ask me.
def splitFile(lines, splitvalue):
    documents = {}
    documentCount = 1
    dcmnt = ""
    for line in lines:
        dcmnt += line
        if splitvalue in line:
            key = "documents" + str(documentCount)
            documents[key] = dcmnt
            dcmnt = ""
            documentCount = documentCount + 1
    return documents

documentswords = []
with open('reuter10.txt', 'r') as f:  # reading a text file and splitting it into single words
    for line in f:
        for word in line.split():
            documentswords.append(word)

stopwords = []
with open('stopwords.txt', 'r') as f:  # reading a text file and splitting it into single words
    for line in f:
        for word in line.split():
            stopwords.append(word)

readFile = open("reuter10.txt", "r")
lines = readFile.readlines()
readFile.close()
alldocuments = splitFile(lines, "</reuters>")

temp = []
for i in range(0, len(documentswords)):
    count = 0
    for ii in range(0, len(stopwords)):
        if documentswords[i] == stopwords[ii]:
            count = count + 1
        if ii + 1 == len(stopwords) and count == 0:
            temp.append(documentswords[i])

print("")
print("*****PRINTING WORDS WITHOUT STOPWORDS*****")
print("")
for i in range(0, len(temp)):  # printing words without stopwords
    print(temp[i])
