Stopword not removing one word - python

I want to remove 'dan' in the filtering process, but it didn't work.
Here is my code:
for row in readCSV:
    _word = []
    username = row[0]
    date = row[1]
    text = row[2].lower()
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = replaceMultiple(text, ["!","@","#","$","%","^","&","*","(",
                                  ")","_","-","+","=","{","}","[","]",
                                  "\\","/",",",".","?","<",">",":",";",
                                  "'",'"',"~","0","1","2","3","4","5","6","7","8","9"], '')
    text = text.strip()
    nltk_tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words("indonesian"))
    stop_words_new = ['aku','dan','duh','hhhmmm','thn','nih','tgl',
                      'hai','jazz','bro','broo','msh','']
    new_stopwords_list = stop_words.union(stop_words_new)
Every word in stop_words_new is removed except 'dan'.
Why?

The code should not be working because you are joining a set with a list. Try making stop_words_new a set instead of a list.
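For reference, a minimal sketch of the combined stopword set plus the filtering step itself, which the posted code never reaches (nltk_tokens comes from the question's code; the final list comprehension is an assumption about the intended next step):

stop_words = set(stopwords.words("indonesian"))
stop_words_new = {'aku', 'dan', 'duh', 'hhhmmm', 'thn', 'nih', 'tgl',
                  'hai', 'jazz', 'bro', 'broo', 'msh'}
new_stopwords_list = stop_words.union(stop_words_new)

# Keep only the tokens that are not in the combined stopword set.
filtered_tokens = [t for t in nltk_tokens if t not in new_stopwords_list]

As an aside, set.union() accepts any iterable, so the union succeeds with a list as well; if 'dan' still survives, printing nltk_tokens can confirm whether tokenization really yields a bare 'dan' token.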

Related

deleting a specific line from a dataframe python NLP

I am trying to preprocess my data for an NLP model. I wrote this code to remove numbers, symbols, and hyperlinks. Now I want to delete every line that contains the word 'system', but I can't figure out how to do that. df is my dataframe and df['Content'] is the column holding the text I want to delete lines from.
For example, the text can be:
"system: hi im the line that is meant to be deleted
Leena: this line must not be deleted
system: hi again im the line that is meant to be deleted "
The output should be:
Leena: this line must not be deleted
def CleaningTXT(df):
    Allchat = list()
    lines = df['Content'].values.tolist()
    for text in lines:
        text = text.lower()
        # remove links
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # remove session join/leave
        pattern = re.compile('new party join session')
        text = pattern.sub('', text)
        pattern = re.compile('new party leave session')
        text = pattern.sub('', text)
        # remove symbols
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        # separating words
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # removing numbers
        words = [word for word in stripped if word.isalpha()]
        words = ' '.join(words)
        Allchat.append(words)
    return Allchat
I hope I understand your request correctly.
Try the following:
def CleaningTXT(df):
    Allchat = list()
    # Added: drop every row whose Content contains 'system'
    index_to_drop = df[df['Content'].str.contains('system')].index
    df.drop(index_to_drop, inplace=True)
    lines = df['Content'].values.tolist()
    for text in lines:
        text = text.lower()
        # remove links
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # remove session join/leave
        pattern = re.compile('new party join session')
        text = pattern.sub('', text)
        pattern = re.compile('new party leave session')
        text = pattern.sub('', text)
        # remove symbols
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        # separating words
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # removing numbers
        words = [word for word in stripped if word.isalpha()]
        words = ' '.join(words)
        Allchat.append(words)
    return Allchat
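If each Content cell actually holds a multi-line chat, as in the example above, dropping whole rows would also discard the lines that must be kept. A hedged alternative is to filter line by line inside each cell before cleaning (a sketch; drop_system_lines is a hypothetical helper name):

def drop_system_lines(text):
    # Keep only the lines that do not start with the 'system' speaker tag.
    return '\n'.join(line for line in text.splitlines()
                     if not line.strip().lower().startswith('system'))

df['Content'] = df['Content'].apply(drop_system_lines)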

Write to csv from modified csv files with python code

I'm trying to process data (removing hashtags, links, and @ mentions) from CSV files and store it back to a CSV. However, the output does not come out well: it is separated with a comma after each character. Can anyone help me write it to CSV in a better way? Thank you.
import re, string
import csv

def strip_links(text):
    link_regex = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')
    return text

def strip_all_entities(text):
    entity_prefixes = ['@', '#']
    for separator in string.punctuation:
        if separator not in entity_prefixes:
            text = text.replace(separator, ' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)
f = open('Test.csv')
csf_f = csv.reader(f)
temp = []
for row in csf_f:
    temp.append(row[0])
temp1 = []
for t in temp:
    temp1.append(strip_all_entities(strip_links(t)))
for i in range(0, len(temp1)):
    with open('MYOUTPUT.csv', 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(temp1)
f.close()
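For what it's worth, the comma-per-character output comes from writer.writerows(temp1): temp1 is a list of strings, and writerows() treats each string as a row, iterating it character by character. A minimal sketch of a fix, assuming one cleaned string per output row (it also opens the output file once instead of once per loop iteration):

with open('MYOUTPUT.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    for t in temp1:
        writer.writerow([t])  # wrap the string in a list: one column per row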

only get the last result of the xml file

Here is my code. I used print() to check, and I marked what I found next to the code with comments:
def file(entry):
    file_name = str(entry)
    if file_name.endswith('.xml'):
        tree = ET.parse(file_name)
        root = tree.getroot()
        for i in range(len(root)):
            in_text = str(root[i][5].text).lower()
            print(in_text)  # here I still get all data
    elif file_name.endswith('.json'):
        with open(file_name) as f:
            j_text = json.load(f)
        in_text = (j_text['text']).lower()
    else:
        root_error = tk.Tk()
        root_error.title('Error !')
        canvas_error = tk.Canvas(root_error, height=10, width=100)
        canvas_error.pack()
        label_error = tk.Label(root_error, text='file type dont support')
        label_error.pack()
        root_error.mainloop()
    remove_digits = str.maketrans('', '', digits)
    res = in_text.translate(remove_digits)
    print(res)  # here I get only the last one
    token_text = sent_tokenize(res)
    sent_string = ('\n'.join(token_text))
    removed_pun = str(sent_string).translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(str(removed_pun))
    result = [i for i in tokens if not i in stop_words]
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in result]
    lemmatizer = WordNetLemmatizer()
    final_text = ' '.join([lemmatizer.lemmatize(w) for w in stemmed])
    lower_label_out['text'] = final_text
But when I use only this code:
tree = ET.parse('books.xml')
root = tree.getroot()
for i in range(len(root)):
    print(root[i][5].text)
I get all the data. I don't know why the full function only gives me the last record; how can I fix it?
As written in the comment, your problem is that you overwrite the label['text'] value in each iteration. With the new indentation, you just shifted the problem from the out_text variable to the label['text'] variable. If you want to get a list of all out_texts, I'd suggest the following:
out_text = []
for i in range(len(root)):
    # in each iteration, append the new string to the list
    out_text.append(str(root[i][0].text))
label_out['text'] = out_text
In each iteration, the value of str(root[i][0].text) is appended to the list, which is finally assigned to label_out['text'].
However, I'd suggest you look into how for loops work in Python, as you could write the same statement as follows:
out_text = []
for ro in root:
    out_text.append(str(ro[0].text))
label_out['text'] = out_text
The reason the print() statement works is that you put it inside the for loop, so each time the code passes there, the current value is printed to the screen.
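As a small aside (a sketch, assuming label_out is a Tkinter label): the text option expects a string, so you may want to join the collected values before assigning them:

out_text = []
for child in root:
    out_text.append(str(child[5].text))  # index 5, as in the question's XML
label_out['text'] = '\n'.join(out_text)  # one record per line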
The last line in your for loop is mis-indented, so it only shows the last element.
Try changing it to:
for i in range(len(root)):
    out_text = str(root[i][0].text)
    label_out['text'] = out_text  # note the new indentation
and see if it works.

Use of For loop in processing directory contents in Python

I am attempting to loop through a series of text files in a directory, looking for occurrences of certain types of words and prefixing each found word with a user-defined tag. My code is as follows:
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()

data = data.lower()
modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                      "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                 "cogitate":1, "contemplate":1, "deliberate":1}
format_modal = "<555>{} ".format
format_attribute = "<666>{} ".format
format_app_adaptor = "<777>{} ".format
format_plaus_shield = "<888>{} ".format
data = " ".join(format_modal(word) if word in modals else word for word in data.split())
data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
with open(filename, "w") as f:
    f.write(str(data))
print(data)  # This is just added in order to check on screen
             # that all files are being processed.
My problem is that although the code works on the last file in the directory, it is not working on the previous files (only 1 out of 10 is processed here). I've tried a second for loop above the file write-out statements, but that is not working at all. Can anyone explain what I'm doing wrong here?
Regards
My speculation is that your code only processes the last file because it's not indented properly to have all relevant code within the for loop.
Try this indentation:
ACC_Tagged_Test = 'C:/ACC_Tag_Test'
for filename in glob.glob(os.path.join(ACC_Tagged_Test, '*.txt')):
    with open(filename) as f:
        data = f.read()
    data = data.lower()
    modals = {"could":1, "would":1, "should":1, "can":1, "may":1, "might":1}
    personal_attribute = {"believes":1, "guess":1, "surmise":1, "considers":1,
                          "presume":1, "speculate":1, "postulate":1, "surmised":1, "assume":1}
    approx_adapt = {"broadly":1, "mainly":1, "mostly":1, "loosely":1,
                    "generally":1, "usually":1, "typically":1, "regularly":1, "widely":1}
    plaus_shields = {"wonder":1, "suspect":1, "theorize":1, "hypothesize":1,
                     "cogitate":1, "contemplate":1, "deliberate":1}
    format_modal = "<555>{} ".format
    format_attribute = "<666>{} ".format
    format_app_adaptor = "<777>{} ".format
    format_plaus_shield = "<888>{} ".format
    data = " ".join(format_modal(word) if word in modals else word for word in data.split())
    data = " ".join(format_attribute(word) if word in personal_attribute else word for word in data.split())
    data = " ".join(format_app_adaptor(word) if word in approx_adapt else word for word in data.split())
    data = " ".join(format_plaus_shield(word) if word in plaus_shields else word for word in data.split())
    with open(filename, "w") as f:
        f.write(str(data))
    print(data)  # This is just added in order to check on screen
                 # that all files are being processed.
Assuming all of your code is supposed to be inside the for loop: you are overwriting your text file, so it looks as though only the last run worked:
# this overwrites the file
with open(filename, "w") as fh:
    fh.write(str(data))
Change it to:
# this appends to the file
with open(filename, "a") as fh:
    fh.write(str(data))
This will append to your text file and will not overwrite previously added data with the data from the last loop.
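As a design note either way, it can be safer to write the tagged text to a separate output file so the source corpus is never modified in place (a sketch; the _tagged suffix is a hypothetical naming choice):

out_name = filename.replace('.txt', '_tagged.txt')  # hypothetical output name
with open(out_name, 'w') as f:
    f.write(data)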

missing some text when parsing article content with lxml, what is the issue?

I am using the following code to parse an article from a French news site. When getting all the paragraphs, I keep missing some text. Why is that?
Here is my code. The lines marked with XX are the most relevant; the other parts are just me putting the data into my own structure:
def getWordList(sent, wordList):
    listOfWords = list((sent).split())
    for i in listOfWords:
        i = i.replace(".", " ")
        i = i.replace(",", " ")
        i = i.replace('\"', " ")
        valids = re.sub(r"[^A-Za-z]+", '', i)
        if(len(i) > 3 and (i.lower() not in stopWords) and
           i.isnumeric() != True and valids):
            wordList[valids] = {}
            wordList[valids]["definition"] = ""
            wordList[valids]["status"] = ""

def parse(link):
    page = requests.get(link)
    tree = html.fromstring(page.content)
XX  word = tree.xpath('//*[@class="article__content old__article-content-single"]')
    articleContent = {}
    articleContent["words"] = {}
    articleContent["language"] = "French"
    wordList = articleContent["words"]
    contentList = []
XX  pTag = word[0].xpath('//*')
    pText = {}
    for x in range(len(pTag)):
        # print(pTag[x].get("class"))
        if(pTag[x].text != None):
            if(pTag[x].tail != None):
                print("tail")
XX              text = pTag[x].text + pTag[x].tail
            else:
                print("no tail")
XX              text = pTag[x].text
XX          if(pTag[x].get("class") == "article__paragraph "):
                print(pTag[x].get("class"))
                print(text)
                getWordList(text, wordList)
                pText[text] = {}
                pText[text]["status"] = ""
                pText[text]["type"] = "p"
XX          elif(pTag[x].get("class") == "article__sub-title"):
                print(pTag[x].get("class"))
                getWordList(text, wordList)
                pText[text] = {}
                pText[text]["status"] = ""
                pText[text]["type"] = "h2"
here is an example article link: https://www.lemonde.fr/economie/article/2019/05/23/vivendi-chercherait-a-ceder-universal-music-group-au-chinois-tencent_5466130_3234.html
I am successfully getting all the highlighted text, but the rest is missing. (Not the text in the middle; I am avoiding that on purpose.) I just want the text in between, which is not being included.
Thank you for your help!
You're trying to get the content of tags that contain other tags. For example, there are <em> emphasized-text tags inside the <p> paragraph tags.
Use the text_content() method instead of text to get the full content of your paragraphs:
text = pTag[x].text_content() + pTag[x].tail
and
text = pTag[x].text_content()
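For illustration, a minimal sketch of the difference on a hypothetical fragment (assuming lxml.html):

from lxml import html

frag = html.fromstring('<p>Un <em>mot</em> au milieu.</p>')
print(frag.text)            # 'Un ' -- .text stops at the first child element
print(frag.text_content())  # 'Un mot au milieu.' -- full recursive text of the subtree

This is why paragraphs with <em> or <a> children lose everything after the first child when read through .text alone.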
