Reading each line and writing to the end of it - python

I am new to Python, so please excuse me if I am asking a very simple question.
I am trying to read each line from a text file, predict the sentiment of each line, and write the output to the end of the text file. For that I am trying to append data to the end of each line.
My text file looks like below:
I am awesome.
I am terrible.
I am bad.
What I am trying to achieve is below:
I am awesome. - Positive
I am terrible. - Negative
I am bad. - Negative
When I run the code, the file ends up empty. Please help.
My code is as below:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names

def word_feats(words):
    return dict([(word, True) for word in words])

positive_vocab = ['awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)']
negative_vocab = ['bad', 'terrible', 'useless', 'hate', ':(']

positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]

train_set = negative_features + positive_features
classifier = NaiveBayesClassifier.train(train_set)

# Predict
neg = 0
pos = 0
f = open("test.txt", "r")
for sentence in f.readlines():
    sentence = sentence.lower()
    words = sentence.split(' ')
    for word in words:
        classResult = classifier.classify(word_feats(word))
        if classResult == 'neg':
            f.write(' negative')
        if classResult == 'pos':
            f.write(' positive')
f.close()

You can't write to a file that is open in 'r' mode - that mode is for reading.
My suggestion is to open the file for reading, and open a second file and write out to that. So something like:
f = open("test.txt", "r")
out_file = open("output.txt", "w")
for sentence in f.readlines():
orig = sentence
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
classResult = classifier.classify(word_feats(word))
if classResult == 'neg':
out_file.write(orig + ' negative')
if classResult == 'pos':
out_file.write(orig + ' positive')
f.close()
out_file.close()
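One more thing worth flagging: word_feats expects an iterable of words, so calling word_feats(word) on a single string builds features from individual characters rather than whole words. A minimal sketch of one way to keep everything at word level (hedged, not tested against your data):

# wrap each vocabulary word in a list so the training features are whole words
positive_features = [(word_feats([pos]), 'pos') for pos in positive_vocab]
negative_features = [(word_feats([neg]), 'neg') for neg in negative_vocab]

# then classify the whole lowercased sentence at once instead of word by word
classResult = classifier.classify(word_feats(sentence.lower().split()))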

You are opening the file in read mode. You would need to open the file for writing instead:
f = open('test.txt', 'w')
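Bear in mind that 'w' truncates the file the moment it is opened, so the sentences would be gone before you could read them. A minimal sketch, assuming you want to read everything first and then rewrite the same file with the labels appended:

# read all lines first, then reopen the same file for writing
with open("test.txt", "r") as f:
    lines = f.readlines()

with open("test.txt", "w") as f:
    for sentence in lines:
        classResult = classifier.classify(word_feats(sentence.lower().split()))
        label = ' - Positive' if classResult == 'pos' else ' - Negative'
        f.write(sentence.rstrip('\n') + label + '\n')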


Removing characters in a file and also making it neat

I want to remove all the characters in deleteList from the file (lines 11 to 15 of my code). What would be the neatest way to delete them without making the code messy? I am not sure whether to open the file again here, which I know would not be the right way, but I can't think of a different solution. Any help would be appreciated.
from os import write
import re

def readText():
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt') as f:
        print(f.read())

def longestWord():
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'r+') as f:
        users_text = f.read()
        # I want to remove all the characters in deleteList from the file.
        # What would be the neatest way to do that without making the code messy?
        # I am not sure whether to open the file again here and rewrite it, or what!
        deleteList = ['!', '£', '$', '%', '^', '&', '*', '()', '_', '+']
        for line in f:
            for word in deleteList:
                line = line.replace(word, '')
        longest = max(users_text.split(), key=len)
        count_longest = str(len(longest))
        print('The longest word in the file is: ' + longest)
        print("That's a total of " + count_longest + ' letters!')

def writeWord():
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'w') as f:
        users_text = input('Enter your desired text to continue. \n: ')
        f.write(users_text)
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'r') as file:
        print(file.read())
    longestWord()
Had to rework it and implement it in a different def. Need to add relative paths, and it will be a lot cleaner as well.
from os import write
import re

def longestWord():
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'r+') as f:
        users_text = f.read()
        longest = max(users_text.split(), key=len)
        count_longest = str(len(longest))
        print('The longest word in the file is: ' + longest)
        print("That's a total of " + count_longest + ' letters!')

def writeWord():
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'w') as f:
        users_text = input('Enter your desired text to continue. \n: ')
        cleanText = re.sub('[^a-zA-Z0-9 \n\.]', ' ', users_text)
        f.write(cleanText)
    with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'r') as clean:
        print('\nRemoved any illegal characters. Here is your text:\n\n' + cleanText + '\n')

while True:
    print("""
Welcome to Skies word text counter!
====================================================
""")
    writeWord()
    longestWord()
    userDecide = input("""
====================================================
Would you like to enter new text and repeat?
Type 'yes' to continue else program will terminate.
====================================================
: """)
    if userDecide.lower() != 'yes':
        print('Application closing...')
        exit()
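As an aside, if the goal is just to strip a fixed set of characters, str.translate tends to be the neatest option since it avoids the nested loops entirely. A minimal sketch, reusing the path and character list from the question:

# build a translation table that maps each unwanted character to None
table = str.maketrans('', '', '!£$%^&*()_+')

with open(r'C:\Users\maxth\Desktop\TextCounter\Text.txt', 'r+') as f:
    cleaned = f.read().translate(table)
    f.seek(0)          # rewind so the write replaces the old contents
    f.write(cleaned)
    f.truncate()       # drop any leftover tail from the longer original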

searching for sentences that are a question in a file

f = open("data.txt", "rt")
lines = f.read()
#word = line.split()
for line in lines:
if line == 'how' or line == 'what' or line == 'where':
print(lines, "Yes")
else:
print(lines, "No")
f.close()
I am trying to read a file and look for sentences that have how, what, where, etc. - basically, sentences that are a question - and print each sentence along with a Yes or No accordingly.
Format of Input file:
how are you
it's 7 o'clock
where is your food
Format of Output file:
how are you Yes
it's 7 o'clock No
where is your food Yes
But my code is giving no output.
The line if line == 'how' or line == 'what' or line == 'where': suggests to me that you might want the logic of the built-in any():
f = open("data.txt", "rt")
lines = f.readlines()
f.close()
with open('data_out.txt', 'w') as f:
for line in lines:
if any(question_word in line for question_word in ['how', 'what', 'where']):
output = '{} {}\n'.format(line.strip('\n'), "Yes")
print(output)
f.write(output)
else:
output = '{} {}\n'.format(line.strip('\n'), "No")
print(output)
f.write(output)
how are you Yes
it's 7 o'clock No
where is your food Yes
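One caveat: a plain substring test also fires on these words inside longer ones ('how' in 'shower', for instance). If that matters, a hedged variant using a word-boundary regex instead:

import re

# match how/what/where only as whole words, case-insensitively
question_pattern = re.compile(r'\b(how|what|where)\b', re.IGNORECASE)

with open('data.txt') as fin, open('data_out.txt', 'w') as fout:
    for line in fin:
        answer = "Yes" if question_pattern.search(line) else "No"
        fout.write('{} {}\n'.format(line.strip('\n'), answer))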

Write to csv from modified csv files with python code

I'm trying to process data (remove hashtags, links and @ mentions) from CSV files and store it back to CSV. However, the output does not come out well: each character is separated by a comma. Can anyone help me write it to CSV in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])
temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))
for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)
f.close()
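For reference, the comma-per-character output happens because writer.writerows(temp1) treats each string in temp1 as a row, and csv then takes each character of the string as a separate field; the output file is also reopened (and truncated) on every pass through the loop. A minimal sketch of one way to write one cleaned string per row, keeping the question's file names:

# open the output once, and wrap each cleaned string in a one-element list
with open('MYOUTPUT.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    for t in temp1:
        writer.writerow([t])  # one row, one field per cleaned line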

JSONDecodeError: Extra data: line 1 column 884 (char 883)

I want to tokenize and sort reviews by keywords, but there is a problem opening the JSON file, and the program throws an error: JSONDecodeError: Extra data: line 1 column 884 (char 883).
The files test2.json and keywords.txt are here:
https://github.com/SilverYar/TransportDataMiner
Here is my code:
import nltk
import string
import json
from nltk.stem.snowball import RussianStemmer
from nltk.corpus import stopwords

st = RussianStemmer()

def tokenize_me(file_text):
    # applying nltk tokenization
    tokens = nltk.word_tokenize(file_text)
    # deleting punctuation symbols
    tokens = [i for i in tokens if (i not in string.punctuation)]
    # deleting stop words
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if (i not in stop_words)]
    # cleaning words
    tokens = [i.replace('«', '').replace('»', '') for i in tokens]
    return tokens

with open('C:\\Creme\\token\\keywords.txt') as fin:
    ww = fin.read().split(', ')
key_words = list(set([st.stem(w) for w in ww]))

with open('C:\\Creme\\token\\test2.json') as fin:
    text = json.load(fin)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
Help me find the error and fix the code.
You can't do this, because it leads to a malformed JSON file:
for dd in text:
    if tt:
        json.dump(dd, fout)  # <<-- cannot do this in the loop
        fout.write('\n')
Basically it should be written all at once, with a single dump() or dumps() call.
OK, so you have to build the large list first, then output it to the file:
bad_words_list = []
for dd in text:
    words = tokenize_me(dd['description'])
    split_text = list(set([st.stem(word) for word in words]))
    tt = list(filter(lambda w: w in key_words, split_text))
    if tt:
        bad_words_list.append(dd)

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    json.dump(bad_words_list, fout)
I decided to do it a little differently: read in the file and reformat the string into valid JSON:
with open('C:\\Creme\\token\\test2.json', 'r', encoding='utf8') as fin:
    data = fin.read()
formated_text = data.replace('}{', '},{')
text = json.loads(f'[{formated_text}]')

with open('C:\\Creme\\token\\bad.json', 'a', encoding='utf8') as fout:
    for dd in text:
        words = tokenize_me(dd['description'])
        split_text = list(set([st.stem(word) for word in words]))
        tt = list(filter(lambda w: w in key_words, split_text))
        if tt:
            json.dump(dd, fout)
            fout.write('\n')
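One caution about the replace trick: it would also rewrite a literal '}{' inside a string value. A more robust hedged sketch uses json.JSONDecoder.raw_decode, which parses one object at a time and reports where it stopped:

import json

def load_concatenated(path):
    # parse a file of back-to-back JSON objects into a list
    decoder = json.JSONDecoder()
    with open(path, encoding='utf8') as fin:
        data = fin.read().strip()
    objects, idx = [], 0
    while idx < len(data):
        obj, idx = decoder.raw_decode(data, idx)  # returns (object, end index)
        objects.append(obj)
        while idx < len(data) and data[idx].isspace():
            idx += 1  # skip whitespace between objects
    return objects

text = load_concatenated('C:\\Creme\\token\\test2.json')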

How to delete specific words from a sentence in text file?

I have two text files. The 1st file contains English sentences and 2nd file contains a number of English words (vocabulary). I want to remove those words from the sentences in the 1st file which are not present in the vocabulary and then to save the processed text back into the 1st file.
I wrote code with which I am able to find the sentences that contain words not available in the 2nd file (vocabulary).
Here is my code:
s = open('eng.txt').readlines()
for i in s:
    print(i)
    for word in i.split(' '):
        print(word)
        if word in open("vocab30000.txt").read():
            print("Word exist in vocab")
        else:
            #print("I:", i)
            print("Word does not exist")
            #search_in_file_func(i)
    print("I:", i)
    file1 = open("MyFile.txt", "a+")
    if i in file1:
        print("Sentence already exist")
    else:
        file1.write(i)
However, I am not able to remove those words.
This should work:
with open('vocab30000.txt') as f:
    vocabulary = set(word.strip() for word in f.readlines())

with open('eng.txt', 'r+') as f:
    data = [line.strip().split(' ') for line in f.readlines()]
    removed = [[word for word in line if word in vocabulary] for line in data]
    result = '\n'.join(' '.join(word for word in line) for line in removed)
    f.seek(0)
    f.write(result)
    f.truncate()
# Read the two files
with open('vocab30000.txt') as f:
    vocabulary = set(w.strip() for w in f.readlines())
with open('eng.txt') as f:
    eng = f.readlines()

sentences = [i.split(" ") for i in eng]
cleaned_sentences = []
# loop over the sentences and drop the words that are not in the vocabulary
for sent in sentences:
    cleaned_sentences.append(" ".join([i for i in sent if i.strip() in vocabulary]))

# write the result back to the sentence file
with open('eng.txt', 'w') as f:
    f.write("\n".join(cleaned_sentences))
You can try this code. I tried to avoid explicit Python loops to save runtime if you have larger files.
import re

with open('eng.txt', 'r') as f:
    s = f.read()
s_copy = s

# strip punctuation first (escape each symbol so '(' or '.' do not break the regex)
punctuation = [".", "\"", ",", "-", "(", ")", "[", "]"]
pattern = re.compile("(" + "|".join(re.escape(p) for p in punctuation) + ")", re.I)
s_copy = pattern.sub(" ", s_copy)
s_words = s_copy.split(" ")

with open('vocab30000.txt', 'r') as f:
    check_words = set(f.read().split())

# words that appear in the text but not in the vocabulary
remove_words = [w for w in set(s_words) - check_words if w]
pattern = re.compile("\\b(" + "|".join(re.escape(w) for w in remove_words) + ")\\W", re.I)
result = pattern.sub("", s)  # sub() returns a new string; the original code discarded it
