Searching keywords in a text file with a dictionary in Python

I am trying to search for a lot of keywords in a text file and return the integers/floats that come after each keyword.
I think it's possible using a dictionary where the keys are the keywords that appear in the text file and the values are functions that return the value that follows.
import re

def store_text():
    with open("path_to_file.txt", 'r') as f:
        text = f.readlines()
    return text

abc = store_text()

def search():
    for index, line in enumerate(abc):
        if "His age is:" in line:
            return int(re.search(r"\d+", line).group())

dictionary = {
    "His age is:": print(search())
}
The code returns the value I search for in the text file, but in search() I want to get rid of typing the keyword again, because it's already in the dictionary.
Later on I want to store the values found in an Excel file.
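A minimal sketch of the idea, assuming the keywords can be listed once up front (the file path and keywords here are placeholders): parameterize search() with the keyword and build the dictionary from that single list, so no keyword is typed twice.
import re

def store_text():
    with open("path_to_file.txt", 'r') as f:
        return f.readlines()

def search(lines, keyword):
    # Return the first integer that follows `keyword`, or None if it never appears.
    for line in lines:
        if keyword in line:
            return int(re.search(r"\d+", line).group())
    return None

keywords = ["His age is:"]  # listed once; extend as needed
abc = store_text()
dictionary = {kw: search(abc, kw) for kw in keywords}
print(dictionary)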

If you have the keywords ready in a list, the following approach can help.
import re
from multiprocessing import Pool

search_kwrds = ["His age is:", "His name is:"]  # add more keywords if you need
# Escape the keywords so any regex metacharacters in them are matched literally.
search_regex = "|".join(re.escape(kwrd) for kwrd in search_kwrds)

def read_search_text():
    with open("path_to_file.txt", 'r') as f:
        text = f.readlines()
    return text

def search(search_line):
    search_res = re.search(search_regex, search_line)
    if search_res:
        kwrd_found = search_res.group(0)
        if kwrd_found:
            suffix_val = int(re.search(r"\d+", search_line).group())
            return {kwrd_found: suffix_val}
    return {}

if __name__ == '__main__':
    search_lines = read_search_text()
    p = Pool(processes=1)  # increase, if you want a faster search
    s_res = p.map(search, search_lines)
    search_results = {kwrd: suffix for d in s_res for kwrd, suffix in d.items()}
    print(search_results)
You can add more keywords to the list and search for them. This focuses on searches where there is a single keyword on a given line and keywords do not repeat on later lines.

You can put the keywords you need to search for in a list. This way you end up specifying your input keywords just once in your program. I've also modified your program to make it a bit more efficient; explanation given in the comments.
import re
import csv

list_of_keywords = ["His age is:", "His number is:", "Your_Keyword3"]  # You can add more keywords to find and match to this list

def store_text():
    with open("/Users/karthick/Downloads/sample.txt", 'r') as f:
        text = f.readlines()
    return text

abc = store_text()

def search(input_file):
    # Initialize an empty dictionary to store the extracted values
    dictionary = dict()
    # Iterate through the lines of the text file
    for line in input_file:
        # For every line, iterate through the keywords to check if any keyword is present
        for keyword in list_of_keywords:
            if keyword in line:
                # If a matching keyword is present, update the dictionary with the new value
                dictionary.update({keyword: re.search(r"\d+", line).group()})
    return dictionary

# Call the above function with the input
output_dict = search(abc)
For storing the output values in a CSV file that Excel can open:
# Write the extracted dictionary to a csv file
with open('mycsvfile.csv', 'w', newline='') as f:  # Specify the path of your output csv file here
    w = csv.writer(f)
    w.writerows(output_dict.items())

Related

I get an error about wrong dictionary update sequence length when trying to read lines from a txt file into a dictionary

I'm trying to loop through multiple lines, add each into a dictionary, and then append the dictionary to a dataframe.
I've had many attempts but no solution yet.
I have a txt file with multiple lines like this, for example, and I'm trying to iterate through each line, add it to a dictionary, and then append the dictionary to a dataframe.
So the text file, for example, would look like this:
ABC=123, DEF="456",
ABC="789", DEF="101112"
I would like this to be added to a dictionary like this (on the first loop, for the first line):
{ABC: 123, DEF: 456}
and then appended to a df like this
ABC DEF
0 123 456
1 789 101112
So far I have tried this; it only works for one line in the text file. When I add a new line, I get this error:
dictionary update sequence element #6 has length 3; 2 is required
with open("file.txt", "r") as f:
s = f.read().strip()
dictionary = dict(subString.split("=") for subString in s.split(","))
dataframe = dataframe.append(dictionary, ignore_index=True)
dataframe
One suggestion is to parse each line with regex, and then insert the matches (if found) into the dictionary. You can change the regex pattern as needed, but this one matches words on the left side of = with numbers on the right, optionally preceded by ' or ".
import re
import pandas as pd

pattern = r'(\w+)=[\'\"]?(\d+)'
str_dict = {}
with open('file.txt') as f:
    for line in f:
        for key, val in re.findall(pattern, line):
            str_dict.setdefault(key, []).append(int(val))
df = pd.DataFrame(str_dict)
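For the sample input above, this produces a dataframe with columns ABC and DEF holding [123, 789] and [456, 101112] respectively, matching the desired output.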
This also works in the scenario of a huge text file with many different strings:
import re
import pandas as pd

data_dict = {}       # assumed initialization; not shown in the original snippet
df = pd.DataFrame()  # assumed initialization; not shown in the original snippet

file = open('event.txt', 'r').readlines()
for group in file:
    output1 = group.replace('Event time', 'Event_time')
    words = re.findall(r'".*?"', str(output1))
    for word in words:
        text = str(output1).replace(word, word.replace(" ", "_"))
    output2 = text.strip().split(' ')
    for section in output2:
        key, val = section.strip().split('=')
        data_dict[key.strip()] = val.strip()
    df = df.append(data_dict, ignore_index=True)  # in pandas >= 2.0, use pd.concat instead
print(df)

How to search between two strings in a text file, using the strings themselves to delimit the search range?

Currently this is what I have. Once the word 'Fail' is found, it needs to search above the 'Fail' string and display everything up until it hits another string I set, and then ask the user for a string to search for within that area. Am I correct in creating a new text file to store the lines that contain the user's string?
from typing import Any
from collections import Counter

errorList = []
with open('File1.txt', 'r') as f:
    data = f.readlines()
    for line in data:
        if 'Fail ' in line:
            errorList.append(line)
errorList = [i[30:56] for i in errorList]
print("Failed are = ", errorList)

string = input("What string do you like to search for in this test case? ")
f = open('File1.txt')
f1 = open('StringSearch.txt', 'a')
for line in f.readlines():
    if string in line:
        f1.write(line)
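A minimal sketch of one way to restrict the search to the region between a chosen start marker and each 'Fail ' line (the marker name and file names are placeholders, and this assumes the start marker appears somewhere above each 'Fail ' line):
start_marker = "Test Start"  # hypothetical delimiter; replace with the real one
query = input("What string do you like to search for in this test case? ")

with open('File1.txt', 'r') as f:
    lines = f.readlines()

# Track the most recent start marker; on each 'Fail ' line, search only the
# lines between that marker and the failure.
matches = []
block_start = 0
for i, line in enumerate(lines):
    if start_marker in line:
        block_start = i
    elif 'Fail ' in line:
        region = lines[block_start:i]  # the lines above this failure
        matches.extend(l for l in region if query in l)

with open('StringSearch.txt', 'a') as out:
    out.writelines(matches)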

Python 3: Trying to change a dictionary to a string and write to file, but it still won't write to file?

I'm trying to take text files, count the word usage of each word as key-value pairs in dictionaries, and write each dictionary to its own file. Then I want to add all of the dictionaries together into one master dictionary and write that to its own text file. When I run the program, I keep getting a TypeError from the save_the_dictionary function, since it's getting passed a dictionary instead of a string; however, I thought that my save_the_dictionary function changes each key-value pair into strings before they are written to the file, but that doesn't seem to be the case. Any help with this would be greatly appreciated. Here is my code:
import os
from nltk.tokenize import sent_tokenize, word_tokenize

class Document:
    def tokenize(self, text):
        dictionary = {}
        for line in text:
            all_words = line.upper()
            words = word_tokenize(all_words)
            punctuation = '''!()-[]{};:'"\,<>./?##$%^&*_~'''
            cleaned_words = []
            for word in words:
                if word not in punctuation:
                    cleaned_words.append(word)
            for word in cleaned_words:
                if word in dictionary:
                    dictionary[word] += 1
                else:
                    dictionary[word] = 1
        return dictionary

    def save_the_dictionary(self, dictionary, filename):
        # This save function writes a new file, turns each key and its corresponding value
        # into strings, and writes them into a text file. It also adds formatting by tabbing
        # over after the key, writing the value, and then making a new line. Then it closes the file.
        newfile = open(filename, "w")
        for key, value in dictionary.items():
            newfile.write(str(key) + "/t" + str(value) + "/n")
        file.close()

# The main idea of this method is that it first converts all the text to uppercase, strips all of
# the formatting from the file that it is reading, then splits the text into a list, using both
# whitespace and the characters above as delimiters. After that, it goes through the entire list
# pulled from the text file and sees if each word is in the dictionary variable. If it is in there,
# it adds 1 to the value associated with that key. If it is not found within the dictionary
# variable, it adds it as a key to the dictionary variable and sets its value to 1.

# The above Document class will only be used within the actual vectorize function.
def vectorize(filepath):
    all_files = os.listdir(filepath)
    full_dictionary = {}
    for file in all_files:
        doc = Document()
        full_path = filepath + "\\" + file
        textfile = open(full_path, "r", encoding="utf8")
        text = textfile.read()
        compiled_dictionary = doc.tokenize(text)
        final_path = filepath + "\\final" + file
        doc.save_the_dictionary(final_path, compiled_dictionary)
        for line in text:
            all_words = line.upper()
            words = word_tokenize(all_words)
            punctuation = '''!()-[]{};:'"\,<>./?##$%^&*_~'''
            cleaned_words = []
            for word in words:
                if word not in punctuation:
                    cleaned_words.append(word)
            for word in cleaned_words:
                if word in dictionary:
                    full_dictionary[word] += 1
                else:
                    full_dictionary[word] = 1
    Document().save_the_dictionary(filepath + "\\df.txt", full_dictionary)

vectorize("C:\\Users\\******\\Desktop\\*******\\*****\\*****\\Text files")

Faster method for replacing multiple words in a file

I am making a mini-translator for Japanese words for a given file.
The script has an expandable dictionary file that includes 13k+ lines in this format:
JapaneseWord<:to:>EnglishWord
So I have to pick a line from the dictionary, then split it to make a list in this format:
[JapaneseWord,EnglishWord]
Then I have to pick a line from the given file, find the first item of this list in the line, and replace it with its English equivalent, repeating this in the same line for the number of times the Japanese word appears (counted with the .count() function).
The problem is that this takes a long time, because this way I have to read the file again and again, 14k+ times, and this will grow as I expand the dictionary.
I tried looking for a way to load the whole dictionary into memory and then compare all entries against the given file at the same time, so that I only read the file once, but I couldn't do it.
Here's the function I am using right now; it takes a variable that contains the file's lines as a list (from file.readlines()):
def replacer(text):
    # Current Dictionary.
    cdic = open(argv[4], 'r', encoding='utf-8')
    # Part To Replace.
    for ptorep in cdic:
        ptorep = ptorep.strip('\n')
        ptorep = ptorep.split('<:to:>')
        for line in text:
            for clone in range(0, line.count(ptorep[0])):
                line = line.replace(ptorep[0], ptorep[1])
    text = ''.join(text)
    return text
This takes around 1 min for a single small file.
Dictionary Method:
import re
from sys import argv  # as in the question's script

with open(argv[4], 'r', encoding='utf-8') as file:
    translations = [line.strip('\n').split('<:to:>') for line in file.readlines()]
translations = {t[0]: t[1] for t in translations}  # Convert to a dictionary where the key is the Japanese word and the value is the English translation

# `text` is the file content to translate, as in the question's replacer().
output = []
for word in re.split(r'(\W+)', text):  # Split into words, keeping the separators (may require tweaking)
    output.append(translations.get(word, word))  # Look up the key `word`; if it does not exist, keep `word` as-is
output = ''.join(output)
Original Method:
Maybe keep the full dictionary in memory as a list:
cdic = open(argv[4], 'r', encoding='utf-8')
translations = []
for line in cdic.readlines():
    translations.append(line.strip('\n').split('<:to:>'))

# Note: I would use a list comprehension for this
with open(argv[4], 'r', encoding='utf-8') as file:
    translations = [line.strip('\n').split('<:to:>') for line in file.readlines()]
And make the replacements off of that:
def replacer(text, translations):
    for entry in translations:
        text = text.replace(entry[0], entry[1])
    return text
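If the dictionary keeps growing, a single compiled alternation can replace every entry in one pass over the text instead of one pass per entry. A minimal sketch, assuming the `translations` dict built above (longer keys are tried first so longer phrases win over their substrings):
import re

def build_replacer(translations):
    # Sort keys longest-first and escape them so regex metacharacters match literally.
    keys = sorted(translations, key=len, reverse=True)
    pattern = re.compile("|".join(map(re.escape, keys)))
    return lambda text: pattern.sub(lambda m: translations[m.group(0)], text)

replace_all = build_replacer(translations)
translated = replace_all(text)  # one pass over `text`, regardless of dictionary size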

How to fetch the values below in Python

I have a file in the below format
.aaa b/b
.ddd e/e
.fff h/h
.lop m/n
I'm trying to read this file. My desired output is: if I find ".aaa" I should get b/b, if I find ".ddd" I should get e/e, and so on.
I know how to fetch the 1st and 2nd columns, but I don't know how to look up the value for a given key. This is what I've written:
file = open('some_file.txt')
for line in file:
    fields = line.strip().split()
    print(fields[0])  # This will give the 1st column
    print(fields[1])  # This will give the 2nd column
This is not the right way of doing things. What approach should I follow?
Any time you want to do lookups, a dictionary is going to be your friend.
You could write a function to load the data into a dictionary:
def load_data(filename):
    result = dict()
    with open(filename, 'r') as f:
        for line in f:
            k, v = line.strip().split()  # will fail if not exactly 2 fields
            result[k] = v
    return result
And then use it to perform your lookups like this:
data = load_data('foo.txt')
print(data['.aaa'])
It sounds like what you may want is to build a dictionary mapping column 1 to column 2. You could try:
file = open('some_file.txt')
field_dict = {}
for line in file:
    fields = line.strip().split()
    field_dict[fields[0]] = fields[1]
Then in your other code, when you see '.ddd' you can simply get the value from the dictionary (e.g. field_dict['.ddd'] should return 'e/e').
Just split each line on whitespace and check whether the first item matches the word you gave. If so, print the second item from the list.
word = input("Enter the word to search : ")
with open(file) as f:
    for line in f:
        m = line.strip().split()
        if m[0] == word:
            print(m[1])
