Deleting a specific line from a dataframe for NLP - python

I am trying to preprocess my data for an NLP model. I wrote this code to remove numbers, symbols and hyperlinks. Now I also want to delete every line that contains the word 'system', but I can't figure out how to do that. df is my dataframe and df['Content'] is the column with the text I want to delete the lines from.
For example, the text can be:
"system: hi im the line that is meant to be deleted
Leena: this line must not be deleted
system: hi again im the line that is meant to be deleted "
The output should be:
Leena: this line must not be deleted
import re
import string
from nltk.tokenize import word_tokenize

def CleaningTXT(df):
    Allchat = list()
    lines = df['Content'].values.tolist()
    for text in lines:
        text = text.lower()
        # remove links
        pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # remove session join/leave
        pattern = re.compile('new party join session')
        text = pattern.sub('', text)
        pattern = re.compile('new party leave session')
        text = pattern.sub('', text)
        # remove symbols
        text = re.sub(r"[,.\"!##$%^&*(){}?/;`~:<>+=-]", "", text)
        # separating words
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # removing numbers
        words = [word for word in stripped if word.isalpha()]
        words = ' '.join(words)
        Allchat.append(words)
    return Allchat

Hope I understood your request.
Try the following:
def CleaningTXT(df):
    Allchat = list()
    # Added: drop every row whose 'Content' contains 'system'
    index_to_drop = df[df['Content'].str.contains('system')].index
    df.drop(index_to_drop, inplace=True)
    lines = df['Content'].values.tolist()
    for text in lines:
        text = text.lower()
        # remove links
        pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        # remove session join/leave
        pattern = re.compile('new party join session')
        text = pattern.sub('', text)
        pattern = re.compile('new party leave session')
        text = pattern.sub('', text)
        # remove symbols
        text = re.sub(r"[,.\"!##$%^&*(){}?/;`~:<>+=-]", "", text)
        # separating words
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # removing numbers
        words = [word for word in stripped if word.isalpha()]
        words = ' '.join(words)
        Allchat.append(words)
    return Allchat
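If each cell of df['Content'] holds several chat lines (as in the example above) and you want to drop only the lines that start with 'system' instead of the whole row, a per-cell filter is another option. This is a minimal sketch, assuming the lines inside a cell are separated by newline characters:

def drop_system_lines(text):
    # keep only the lines that do not start with 'system'
    kept = [line for line in text.split('\n')
            if not line.strip().lower().startswith('system')]
    return '\n'.join(kept)

df['Content'] = df['Content'].apply(drop_system_lines)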


error "argument of type 'module' is not iterable'

def clean_text(text):
    '''Text Preprocessing'''
    # Convert words to lower case
    text = text.lower()
    # Expand contractions
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/[\r\n],"[\r\n]"', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$#\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    # Remove stopwords
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    # Tokenize each word
    text = nltk.WordPunctTokenizer().tokenize(text)
    text = nltk.TreebankWordTokenizer().tokenize(text)
    text = nltk.WordPunctTokenizer().tokenize(text)
    # Lemmatize each token
    lemm = nltk.stem.WordNetLemmatizer()
    text = list(map(lambda word: list(map(lemm.lemmatize, word)), text))
    return text
When I run the above code it runs without an issue, but when I run the code below using the above function it shows "argument of type 'module' is not iterable":
sentences_train = list(map(clean_text, sentences_train))
I have attached an image of the error for reference.
I have tried different ways to solve this, but they made the error worse. If someone can help me with this and tell me why it occurs, that would be really nice. Thank you!
Any suggestions will be considered.
The error seems to be caused by contractions. I don't know how you created contractions, but keep in mind that you can only do if word in contractions: if contractions is a collection of words like
contractions = ["abc", "xyz", "123"].
There seems to be a mismatch between your import of contractions (a module) and what you actually want to iterate over to check whether word is in it (without a complete example it is difficult to say). See the example below for the same error. The solution would be to use contractions.[something iterable from this module]
import os

sent = ['1', '2', '3']
something_it = ['2']

def return_str(s):
    # if s in something_it:  # <- something like this
    if s in os:
        return 'x'
    else:
        return s

another_list = list(map(return_str, sent))
print(another_list)
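For illustration (hypothetical values; assuming contractions is meant to map contracted forms to their expansions, as the lookup contractions[word] in clean_text suggests), a plain dict would make both the membership test and the lookup work:

# a small hand-rolled contractions mapping; a real one would be larger
contractions = {
    "can't": "cannot",
    "won't": "will not",
    "i'm": "i am",
}

word = "can't"
if word in contractions:       # membership test on a dict works fine
    print(contractions[word])  # -> cannot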

How to strip a word from a Python list when it's attached to another word?

I'm trying to remove HTML tags from a string, so I tried the following:
def cleaner(raw):
    stopwords = ['<ul>', '</ul>', '<li>', '</li>']
    querywords = raw.split()
    resultwords = [word for word in querywords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    return result
The problem with this code is that it fails when a tag is attached to a word, e.g. .. <li>Drive ... Is there any way to remove such cases as well?
Simple example. Note this requires pip install beautifulsoup4:
from bs4 import BeautifulSoup
my_html="""<div> This is my list:
<ul>
<li>Coffee</li>
<li>Tea</li>
<li>Milk</li>
</ul>
</div>"""
soup = BeautifulSoup(my_html, 'html.parser')
print(soup.text)
Output:
This is my list:
Coffee
Tea
Milk
This removes all tags:
import re

query = '<HTML><ul>list</ul>more text<li>list item</li>more html text</html>'

def cleaner(raw):
    stopwords = ['<ul>', '</ul>', '<li>', '</li>']
    result = re.sub(r'<.*?>', '', raw)  # or substitute ' ' if you need spaces
    # return re.sub(r' +', ' ', result)  # alternative: also collapse multiple spaces
    return result

print(cleaner(query))
> listmore textlist itemmore html text
This removes only the tags in your list:
query = '<HTML><ul>list</ul>more text<li>list item</li>more html text</html>'

def cleaner(raw):
    stopwords = ['<ul>', '</ul>', '<li>', '</li>']
    result = raw
    for stopword in stopwords:
        result = result.replace(stopword, '')
    return result

print(cleaner(query))
> <HTML>listmore textlist itemmore html text</html>
You can try this to remove every HTML tag using re:
import re

def cleaner(raw):
    raw = re.sub(r"<.*?>", " ", raw)
    return raw.strip()

raw = "<li>Test</li><ul>Drive</ul><h3>Title</h3><body>body of the text</body>"
output:
'Test Drive Title body of the text'
Try this:
import re

def cleaner(raw):
    stopwords = ['<ul>', '</ul>', '<li>', '</li>']
    replace_ = re.compile("|".join(stopwords))
    return " ".join([replace_.sub("", word) for word in raw.split()])

print(cleaner("<ul>test</ul> <li>Drive<li>"))  # test Drive
Like this, stripping each tag from every word in turn:
for a in stopwords:
    querywords = [word.replace(a, '') for word in querywords]
Altogether:
def cleaner(raw):
    stopwords = ['<ul>', '</ul>', '<li>', '</li>']
    querywords = raw.split()
    for a in stopwords:
        querywords = [word.replace(a, '') for word in querywords]
    result = ' '.join(querywords)
    return result
If your issue is that your querywords are prefixed with HTML tags, then I think you can iterate over the querywords and keep each word only if it does not start with any of the stopwords.
temp = []
for each_word in querywords:
    if not any(each_word.startswith(each_stop) for each_stop in stopwords):
        temp.append(each_word)
This may not be efficient; it can also be written as a list comprehension, as sketched below.
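For instance (a sketch of the same check):
temp = [w for w in querywords if not any(w.startswith(s) for s in stopwords)]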
-Siva
This may help :).
def cleaner(raw):
    stopwords = ['<ul>', '</ul>', '<li>', '</li>']
    result = ""
    for word in raw.split():
        for tag in stopwords:
            if tag in word:
                word = word.replace(tag, "")
        if word != "":
            result += word + " "
    return result.rstrip()
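For example, with a made-up input:
print(cleaner("<li>Drive</li> <ul>safely</ul> today"))  # Drive safely today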

NLP clustering documents

I am using the HDBSCAN algorithm to create clusters from the documents I have. To create a vector matrix from the words I am using the tf-idf algorithm, but I want to use GloVe or Word2vec instead (because tf-idf is based on BoW, so it can't capture semantics).
Which method can I use - GloVe, Word2vec, or any other method appropriate for text clustering?
And how can I implement it?
Any help will be highly appreciated!
import csv
import re
import string
import sys

import nltk
import pandas as pd
import hdbscan
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)
    # extracting field names through first row
    fields = next(csvreader)
    # extracting each data row one by one
    duplicates = 0
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                      line, flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = re.sub(
            r"(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line, flags=re.MULTILINE)
        line = ''.join(filter(lambda x: x in string.printable,
                              line))  # filter non-ascii characters
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")

stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)
# print("there are " + str(vocab_frame.shape[0]) + " items in vocab_frame")

# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                   min_df=0.0, stop_words='english',
                                   use_idf=True, tokenizer=tokenize_and_stem,
                                   ngram_range=(1, 3))

# CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()

c = hdbscan.HDBSCAN(min_cluster_size=5)
# PASS TFIDF_MATRIX TO HDBSCAN
c.fit(tfidf_matrix)
print(c.labels_)
sys.exit()
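As a rough sketch of the direction being asked about (not a vetted answer), one way to replace the sparse tf-idf matrix with averaged Word2vec document vectors using gensim might look like this; the gensim 4.x API is assumed, and the helper document_vector is illustrative:

import numpy as np
from gensim.models import Word2Vec

# tokenized documents, reusing the tokenizer defined above
tokenized_docs = [tokenize_only(doc) for doc in synopses]

# train a small Word2vec model on the corpus itself
# (a pre-trained GloVe/Word2vec model could be loaded instead)
w2v = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1)

def document_vector(tokens):
    # average the vectors of the tokens the model knows about
    vectors = [w2v.wv[t] for t in tokens if t in w2v.wv]
    if not vectors:
        return np.zeros(w2v.vector_size)
    return np.mean(vectors, axis=0)

doc_vectors = np.vstack([document_vector(doc) for doc in tokenized_docs])

# cluster the dense document vectors instead of the tf-idf matrix
clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
clusterer.fit(doc_vectors)
print(clusterer.labels_)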

Stopword not removing one word

I want to remove 'dan' in the filtering process, but it didn't work.
Here is my code:
for row in readCSV:
    _word = []
    username = row[0]
    date = row[1]
    text = row[2].lower()
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = replaceMultiple(text, ["!", "#", "#", "$", "%", "^", "&", "*", "(",
                                  ")", "_", "-", "+", "=", "{", "}", "[", "]",
                                  "\\", "/", ",", ".", "?", "<", ">", ":", ";",
                                  "'", '"', "~", "0", "1", "2", "3", "4", "5",
                                  "6", "7", "8", "9"], '')
    text = text.strip()
    nltk_tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words("indonesian"))
    stop_words_new = ['aku', 'dan', 'duh', 'hhhmmm', 'thn', 'nih', 'tgl',
                      'hai', 'jazz', 'bro', 'broo', 'msh', '']
    new_stopwords_list = stop_words.union(stop_words_new)
The words in stop_words_new are removed, except 'dan'.
Why?
The code should not be working because you are joining a set with a list. Try making stop_words_new a set instead of a list.
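A minimal sketch of that suggestion (assuming the rest of the loop then filters nltk_tokens against the combined set):

stop_words = set(stopwords.words("indonesian"))
stop_words_new = {'aku', 'dan', 'duh', 'hhhmmm', 'thn', 'nih', 'tgl',
                  'hai', 'jazz', 'bro', 'broo', 'msh', ''}
new_stopwords_list = stop_words | stop_words_new  # union of two sets
filtered_tokens = [w for w in nltk_tokens if w not in new_stopwords_list]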

How to find and match each element of a list in each sentence?

I have a file containing some sentences. I used polyglot for Named Entity Recognition and stored all detected entities in a list. Now I want to check, for each sentence, whether any entity or pair of entities exists in it, and if so, show it.
Here is what I did:
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
test = Text(input_file, hint_language_code='fa')

list_entity = []
for sent in test.sentences:
    # print(sent[:10], "\n")
    for entity in test.entities:
        list_entity.append(entity)

for i in range(len(test)):
    m = test.entities[i]
    n = test.words[m.start: m.end]  # it shows only the word, not the tag
    if str(n).split('.')[-1] in test:  # if each entity exists in each sentence
        print(n)
It gives me an empty list.
Input:
sentence1: Bill Gate is the founder of Microsoft.
sentence2: Trump is the president of USA.
Expected output:
Bill Gate, Microsoft
Trump, USA
Output of list_entity:
I-PER(['Trump']), I-LOC(['USA'])
How to check if I-PER(['Trump']), I-LOC(['USA']) is in first sentence?
For starters, you were adding the entities of the whole text file input to the entities list.
entities should be collected per sentence of the polyglot object.
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')

list_entity = []
for sentence in file.sentences:
    for entity in sentence.entities:
        # print(entity)
        list_entity.append(entity)

print(list_entity)
Now you don't have an empty list.
As for your problem with identifying the entity terms,
I have not found a way to generate an entity by hand, so the following simply checks whether there are entities with the same term. A Chunk can hold multiple strings, so we can go through them iteratively.
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='ar')

def check_sentence(entities_list, sentence):  # Check if string terms
    for term in entities_list:                # are in any of the entities
        # Compare each Chunk in the list to each Chunk
        # object in the sentence and see if there's any matches.
        if any(any(entityTerm == term for entityTerm in entityObject)
               for entityObject in sentence.entities):
            pass
        else:
            return False
    return True

sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
entity_terms = ["Bill",
                "Gates"]

if check_sentence(entity_terms, sentence):
    print("Entity Terms " + str(entity_terms) +
          " are in the sentence. '" + str(sentence) + "'")
else:
    print("Sentence '" + str(sentence) +
          "' doesn't contain terms" + str(entity_terms))
Once you find a way to generate arbitrary entities all you'll have to do is stop popping the term from the sentence checker so you can do type comparison as well.
If you just want to match the list of entities in the file against a specific sentence, then this should do the trick:
from polyglot.text import Text

file = open('input_raw.txt', 'r')
input_file = file.read()
file = Text(input_file, hint_language_code='fa')

def return_match(entities_list, sentence):  # Check if and which chunks
    matches = []                            # are in the sentence
    for term in entities_list:
        # Check each list in each Chunk object
        # and see if there's any matches.
        for entity in sentence.entities:
            if entity == term:
                for word in entity:
                    matches.append(word)
    return matches

def return_list_of_entities(file):
    list_entity = []
    for sentence in file.sentences:
        for entity in sentence.entities:
            list_entity.append(entity)
    return list_entity

list_entity = return_list_of_entities(file)

sentence_number = 1  # Which sentence to check
sentence = file.sentences[sentence_number]
match = return_match(list_entity, sentence)

if match:
    print("Entity Term " + str(match) +
          " is in the sentence. '" + str(sentence) + "'")
else:
    print("Sentence '" + str(sentence) +
          "' doesn't contain any of the terms" + str(list_entity))
