Remove stopwords from sentences

Remove stopwords from sentences - python

I'd want to remove stopwords from a sentence.
I've this piece of code:
splitted = text.split()
for index, word in enumerate(splitted):
if word in self.stopWords:
del splitted[index]
text = " ".join(splitted)
stopWords is updated with this instruction self.stopWords.update(['.', ',', "\"", "\'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '),', '],', '},', '",', "',", '")', '"]', '"}', "-", "--", '\".', "\'.", '/', ').', '-', '--', '%', '°\'', '(-', '("', '."', '.),', ');', '–', '$', 'a']) but, for example, the letter 'a', such as '.', or ';' isn't deleted from the sentence.
What would I do?

I think it's easier to use list comprehension (or generator expression as I do here):
' '.join(w for w in text.split() if w not in stop_words)

Can you try my code? If you have any question about code,just ask to me.
def splitFile(lines,splitvalue):
documents={};
documentCount=1
dcmnt="";
for line in lines:
dcmnt+=line;
if (line.__contains__(splitvalue)):
key="documents"+(str)(documentCount);
documents[key]=dcmnt;
dcmnt="";
documentCount=documentCount+1;
return documents;
documentswords = []
with open('reuter10.txt','r') as f: #reading a text file and splitting it into single words
for line in f:
for word in line.split():
documentswords.append(word)
stopwords=[]
with open('stopwords.txt','r') as f: #reading a text file and splitting it into single words
for line in f:
for word in line.split():
stopwords.append(word)
readFile=open("reuter10.txt","r");
lines=readFile.readlines();
readFile.close();
alldocuments=splitFile(lines, "</reuters>");
temp=[]
for i in range(0,documentswords.__len__()):
count = 0;
for ii in range(0, stopwords.__len__()):
if documentswords[i]==stopwords[ii]:
count=count+1
if ii+1==stopwords.__len__() and count==0:
temp.append(documentswords[i])
print("")
print("*****PRINTING WORDS WITHOUT STOPWORDS*****")
print("")
for i in range(0, temp.__len__()): #printing words without stopwords
print(temp[i]);

Related

How to extract a specific line out of a text file

I have the code from the attached picture in a .can-file, which is in this case a text file. The task is to open the file and extract the content of the void function. In this case it would be "$LIN::Kl_15 = 1;"
This is what I already got:
Masterfunktionsliste = open("C:/.../Masterfunktionsliste_Beispiel.can", "r")
Funktionen = []
Funktionen = Masterfunktionsliste.read()
Funktionen = Funktionen.split('\n')
print(Funktionen)
I receive the following list:
['', '', 'void KL15 ein', '{', '\t$LIN::Kl_15 = 1;', '}', '', 'void Motor ein', '{', '\t$LIN::Motor = 1;', '}', '', '']
And now i want to extract the $LIN::Kl_15 = 1; and the $LIN::Motor = 1; line into variables.

Use the { and } lines to decide what lines to extract:
scope_depth = 0
line_stack = list(reversed(Funktionen))
body_lines = []
while len(line_stack) > 0:
next = line_stack.pop()
if next == '{':
scope_depth = scope_depth + 1
elif next == '}':
scope_depth = scope_depth - 1
else:
# test that we're inside at lest one level of {...} nesting
if scope_depth > 0:
body_lines.append(next)
body_lines should now have values ['$LIN::Kl_15 = 1;', '$LIN::Motor = 1;']

You can loop through the list, search for your variables and save it as dict:
can_file_content = ['', '', 'void KL15 ein', '{', '\t$LIN::Kl_15 = 1;', '}', '', 'void Motor ein', '{', '\t$LIN::Motor = 1;', '}', '', '']
extracted = {}
for line in can_file_content:
if "$LIN" in line: # extract the relevant line
parsed_line = line.replace(";", "").replace("\t", "") # remove ";" and "\t"
variable, value = parsed_line.split("=") # split on "="
extracted[variable.strip()] = value.strip() # remove whitespaces
output is {'$LIN::Kl_15': '1', '$LIN::Motor': '1'}
now you can access your new variables with extracted['$LIN::Motor'] which is 1

Transform a code tokens list into valid string code

I have written code to transform Python code into a list to compute BLEU score:
import re
def tokenize_for_bleu_eval(code):
code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
code = re.sub(r'\s+', ' ', code)
code = code.replace('"', '`')
code = code.replace('\'', '`')
tokens = [t for t in code.split(' ') if t]
return tokens
Thanks to this snippet my code struct.unpack('h', pS[0:2]) is parsed properly into the list ['struct', '.', 'unpack', '(', 'h', ',', 'p', 'S', '[', '0', ':', '2', ']', ')'].
Initially, I thought I need simply to use the ' '.join(list_of_tokens) but it kills my variable names like this struct . unpack ('h' , p S [ 0 : 2 ] ) and my code is not executable.
I tried to use Regex to stick some variable names but I can't succeed to reverse my function tokenize_for_bleu_eval to find executable code at the end. Is someone get an idea, perhaps without regex which seems to be too complicated here?
EDIT: We can't just remove all spaces between element of the list because there are examples like items = [item for item in container if item.attribute == value] where the result of the backtranslation without space would be itemforiteminaifitem[0]==1 which is not valid.

I am trying to merge the tokens using this script
import re
def tokenize_for_bleu_eval(code):
code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
code = re.sub(r'\s+', ' ', code)
code = code.replace('"', '`')
code = code.replace('\'', '`')
tokens = [t for t in code.split(' ') if t]
return tokens
def merge_tokens(tokens):
code = ''.join(tokens)
code = code.replace('`', "'")
code = code.replace(',', ", ")
return code
tokenize = tokenize_for_bleu_eval("struct.unpack('h', pS[0:2])")
print(tokenize) # ['struct', '.', 'unpack', '(', '`', 'h', '`', ',', 'p', 'S', '[', '0', ':', '2', ']', ')']
merge_result = merge_tokens(tokenize)
print(merge_result) # struct.unpack('h', pS[0:2])
Edit:
I found this interesting idea to tokenize and merge.
import re
def tokenize_for_bleu_eval(code):
tokens_list = []
codes = code.split(' ')
for i in range(len(codes)):
code = codes[i]
code = re.sub(r'([^A-Za-z0-9_])', r' \1 ', code)
code = re.sub(r'([a-z])([A-Z])', r'\1 \2', code)
code = re.sub(r'\s+', ' ', code)
code = code.replace('"', '`')
code = code.replace('\'', '`')
tokens = [t for t in code.split(' ') if t]
tokens_list.append(tokens)
if i != len(codes) -1:
tokens_list.append([' '])
flatten_list = []
for tokens in tokens_list:
for token in tokens:
flatten_list.append(token)
return flatten_list
def merge_tokens(flatten_list):
code = ''.join(flatten_list)
code = code.replace('`', "'")
return code
test1 ="struct.unpack('h', pS[0:2])"
test2 = "items = [item for item in container if item.attribute == value]"
tokenize = tokenize_for_bleu_eval(test1)
print(tokenize) # ['struct', '.', 'unpack', '(', '`', 'h', '`', ',', ' ', 'p', 'S', '[', '0', ':', '2', ']', ')']
merge_result = merge_tokens(tokenize)
print(merge_result) # struct.unpack('h', pS[0:2])
tokenize = tokenize_for_bleu_eval(test2)
print(tokenize) # ['items', ' ', '=', ' ', '[', 'item', ' ', 'for', ' ', 'item', ' ', 'in', ' ', 'container', ' ', 'if', ' ', 'item', '.', 'attribute', ' ', '=', '=', ' ', 'value', ']']
merge_result = merge_tokens(tokenize)
print(merge_result) # items = [item for item in container if item.attribute == value]
This script will also remember each space from the input

Concatenate some strings in a list

I have a list:
["i", "'", "ll", "get", "you", "in", "their", "call"]
And I want to convert it to:
["i'll", "get", "you", "in", "their", "call"]
So I need to concatenate words before and after apostrophes.
I tried to do it like this:
restored_text = ['i', "'", 'll', 'get', 'you', 'in', 'their', 'call']
restored_text_fixed = []
k = 0
i = 0
while i in (x for x in range(len(restored_text)) if k == 0):
print(i)
print('k', k)
if k > 0:
k = k - 1
if restored_text[i+1] == "'" and i > 0:
restored_text_fixed.append(restored_text[i] + "'" + restored_text[i+2])
k = 2
else:
restored_text_fixed.append(restored_text[i])
i += 1
But it seems that code doesn't work properly.

You can create an iterator from the list and concatenate the current item and the next item from the iterator with the previous item if the current item is an apostrophe:
restored_text = ["i", "'", "ll", "get", "you", "in", "their", "call"]
restored_text_fixed = []
i = iter(restored_text)
for s in i:
if s == "'":
restored_text_fixed[-1] += s + next(i)
else:
restored_text_fixed.append(s)
restored_text_fixed becomes:
["i'll", 'get', 'you', 'in', 'their', 'call']

Something that is a bit more in the flavour of the original post:
restored_text = ['i', "'", 'll', 'get', 'you', 'in', 'their', 'call']
restored_text_fixed = []
i = 0
while i < len(restored_text):
# catch situations where there is no next element, otherwise the
# next if clause will break the script
if i == len(restored_text)-1:
restored_text_fixed.append(restored_text[i])
break
if restored_text[i+1] == "'":
restored_text_fixed.append(restored_text[i] + "'" + restored_text[i+2])
i += 3
else:
restored_text_fixed.append(restored_text[i])
i += 1
print(restored_text_fixed)
Note that the other answers are considered a lot more pythonic, and that this looping over indices can usually be improved. Hopefully this can fix the bugs with your particular use case though

This might not work if you have space (" ") in one of the elements or if you have apostrophe between two spaces (" ` ") in the elements, but if this list is a result of some tokenization process you could use this simple approach -
Just join them back with spaces, replace " ` " with "`" and split back.
a = ["i", "'", "ll", "get", "you", "in", "their", "call"]
" ".join(a).replace(" ' ", "'").split(" ")
> ["i'll", 'get', 'you', 'in', 'their', 'call']
If you do have spaces but you absolutely sure you don't have other character (e.g. $) you can replace spaces with $ before and then replace them back to spaces afterwards.

restored_text = ["i", "'", "ll", "get", "you", "in", "their", "call"]
#preparing the array "restored_text_fixed" to make expected number of elements
j = 0
for item in restored_text:
if item == "'":
j +=1
j = j * 2 # because each apostrophe took 3 elements from array, like ['i', "'", 'll'] total 3 elements, after it become 1 element ['i'll']
restored_text_fixed = ["0" for x in range(len(restored_text)-(j))]
#the processes above to allow us respect the position of elements from old array "restored_text" to new array "restored_text_fixed"
p = 0
for itemPos in range(len(restored_text)):
if itemPos < len(restored_text)-1 and restored_text[itemPos+1] == "'":
restored_text_fixed[p] = restored_text[itemPos]+""+ restored_text[itemPos+1]+""+restored_text[itemPos+2]
p += 1
elif restored_text[itemPos-1] != "'" and restored_text[itemPos] != "'":
restored_text_fixed[p] = restored_text[itemPos]
p += 1
print(restored_text_fixed)
#OUTPUT
#["i'll", 'get', 'you', 'in', 'their', 'call', "get'in"]
#NOTE
#restored_text = ['i', "'", 'll', 'get', 'you', 'in', 'their', 'call', 'get', "'", 'in'] # add more elements to test if can work from any pos. it work

You can use the following line:
>>> l = ["i", "'", "ll", "get", "you", "in", "their", "call"]
>>> "'".join(map(str.strip, ' '.join(l).split("'"))).split()
["i'll", 'get', 'you', 'in', 'their', 'call']
Breaking it down:
l = ' '.join(l) # join words into sentence => "i ' ll get you in their call"
l = l.split("'") # split by quotation mark => ['i ', ' ll get you in their call']
l = map(str.strip, l) # strip off spaces => ['i', 'll get you in their call']
l = "'".join(l) # add back quotation mark => "i'll get you in their call"
l = l.split() # split back into words => ["i'll", 'get', 'you', 'in', 'their', 'call']

how to remove the empty quotes from the list using python?

I have a python script that pre-processes the text before I can make the text analysis. Some of the functions to clean the text are:
to remove strings that are less than two characters.
to tokenize the text
The problem is that the first function returns a list and the second takes the returned list and also returns a list so it becomes list inside list. Like this:
['[', "'الموضوع", "'", ',', "'إجتماع", "'", ',', "'بين", "'", ',',
"'الجنة", "'", ',', "'البحرية", "'", ',', "'الفرعية", "'", ',',]']
where the result must be like this :
['الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية',]
the returned result of the remove stop word :
['ا', 'ل', 'م', 'و', 'ض', 'و', 'ع', ' ', 'إ', 'ج', 'ت', 'م', 'ا', 'ع', ' ', 'ب', 'ي', 'ن', ' ', 'ا', 'ل', 'ج', 'ن', 'ة', ' ', 'ا', 'ل', 'ب', 'ح', 'ر', 'ي', 'ة', ' ', 'ا', 'ل', 'ف', 'ر', 'ع', 'ي', 'ة', ' ', 'و', 'ن', 'ظ', 'ي', 'ر', 'ت', 'ه', 'ا', ' ', 'ف', 'ي', ' ', 'م', 'ب', 'س', 'و', 'ط', ' ', 'ا', 'ل', 'م', 'س', 'ت', 'ن', 'د', ' ', 'ب', 'ر', 'ق', 'ي', 'ة', ' ', 'ر', 'ق', 'م', ' ', '1', '7', '1', 'ع', ' ', 'ت', 'ا', 'ر', 'ي', 'خ', ' ', '1', '2', '1', ]
where the result must be:
['تاريخ', '1212019','الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية',]
code
def remove_1char(text):
tokens = text.split()
tokens = [word for word in tokens if len(word) > 1]
result = str(tokens)
write_file("remove_1char.txt",result)
return result
def tokenize_text(text):
tokens=word_tokenize(text)
write_file("tokenize_text.txt",tokens)
return tokens
def remove_stopwords(tokens):
write_file("tokens_before_remove.txt",tokens)
stop_word_list = set(stopwords.words('arabic'))
clean_tokens= [tok for tok in tokens if tok not in stop_word_list]
write_file("remove_stop_word.txt",clean_tokens)
return clean_tokens
def clean_text(text):
rmws = remove_whiteSpace(text)
rmp = remove_punctuations(rmws)
rmd = remove_diacritics(rmp)
rmrc = remove_repeating_char(rmd)
rm1c = remove_1char(rmrc)
clean_tokens = remove_stopwords(rm1c)
write_file("result.txt",clean_tokens)
return clean_tokens
So how to fix this problem?

Let's open a Python REPL and go through your code.
I assume the first line specifies the input string, to assign it to a variable.
>>> l = ['الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية',]
>>> l
['الموضوع', 'إجتماع', 'بين', 'الجنة', 'البحرية', 'الفرعية']
You didn't specify which function is called, but I assume you first call the function remove_1char for each input string. We will call the lines of the function one after another an view the results for the first item of the list, which we will call text.
>>> text = l[0]
>>> tokens = text.split()
>>> tokens
>>> ['الموضوع']
Since every word in the input sequence consists of one word, the output is as expected.
>>> tokens = [word for word in tokens if len(word) > 1]
>>> tokens
['الموضوع']
And all words have more then 1 character. Also as expected.
>>> result = str(tokens)
>>> result
"['الموضوع']"
>>>
In this line the string representation of the list is assigned to result. That's probably not what you want. I think you want to connect the tokens to a single string. This can be done with the join function.
>>> result = ' '.join(tokens)
>>> result
'الموضوع'
>>>

PYTHON How to count letters in words without special characters

I have a code that counts letters in words excluding special characters at the end. I just can't figure out a way to get it to exclude special character at the beginning also.
My code so far:
inFile = open( 'p.txt', "r" ).readlines()
myResults = []
for i in range( 20 ):
myResults.append( 0 )
mySpecialList = [ '-', '+', '#', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`' ]
for line in inFile:
words = str.split( line )
for word in words:
if word not in mySpecialList:
if word[ -1 ] not in mySpecialList :
myResults[ len( word ) ] += 1
else :
myResults[ len( word ) - 1 ] += 1
print( myResults )

Here is some simple code to count all the alpha numeric letters of a single word.
word = "Hello World!"
count = 0
for c in word:
if c.isalnum():
count +1
print( count )
If you wanted to use your special characters you could adapt the code to look like
mySpecialList = ['*', '!']
word = "Hello World!"
count = 0
for c in word:
if c not in mySpecialList:
count +1
print( count )

You can use regular expressions, try it!
For example you can split string and after findall you have a list with all words.
import re
string = "Hello World, Hi + Say"
print(re.findall(r"[\w']+", string))

def reverseword(user_input):
words=str(user_input).split(" ")
newWords = [word[::-1] for word in words]
newSentence = " ".join(newWords)
return newSentence
if __name__ == "__main__":
while True:
ispresent=0
splcharlist=['-', '+', '#', '#', '!', '(', ')', '?', '.', ',', ':', ';', '"', "'", '`'," "]
user_input=input("Enter the input:")
print(len(user_input))
ccount=0
new_input=""
ch_count=0
if len(user_input)>100:
for eletter in user_input:
if eletter not in splcharlist:
ccount=ccount+1
ch_count=ch_count+1
if ccount>100:
break
new_input=user_input[:100]
else:
new_input=user_input
print("This is for your input:",user_input)
print("input with limit :"+str(new_input))
print(len(new_input))
print("The Reverse lists is: ",reverseword(new_input))
if "stop" in user_input:
break

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Remove stopwords from sentences - python

I think it's easier to use list comprehension (or generator expression as I do here): ' '.join(w for w in text.split() if w not in stop_words)

Related

How to extract a specific line out of a text file

Transform a code tokens list into valid string code

Concatenate some strings in a list

how to remove the empty quotes from the list using python?

PYTHON How to count letters in words without special characters

Categories

Resources