Trying to create a dictionary from a text file in Python

So, I have a text file (a paragraph), and I need to read the file and create a dictionary containing each distinct word from the file as a key; the value for each key will be an integer giving how often that word occurs in the file.
An example of what the dictionary should look like:
{'and':2, 'all':1, 'be':1, 'is':3} etc.
So far I have this:
def create_word_frequency_dictionary():
    filename = 'dictionary.txt'
    infile = open(filename, 'r')
    line = infile.readline()
    my_dictionary = {}
    frequency = 0
    while line != '':
        row = line.lower()
        word_list = row.split()
        print(word_list)
        print(word_list[0])
        words = word_list[0]
        my_dictionary[words] = frequency+1
        line = infile.readline()
    infile.close()
    print(my_dictionary)

create_word_frequency_dictionary()
Any help would be appreciated, thanks.

The documentation defines the collections module as "High-performance container datatypes". Consider using collections.Counter instead of reinventing the wheel.
from collections import Counter
filename = 'dictionary.txt'
infile = open(filename, 'r')
text = str(infile.read())
print(Counter(text.split()))
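If case shouldn't matter (so 'The' and 'the' count as the same word), a small variation of the above lowercases the text first; most_common() is also handy for a quick look at the top entries. A minimal sketch, assuming the same dictionary.txt:
from collections import Counter

with open('dictionary.txt', 'r') as infile:
    text = infile.read()

counts = Counter(text.lower().split())
print(counts.most_common(5))  # the five most frequent words with their counts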
Update:
Okay, I fixed your code and now it works, but Counter is still a better option:
def create_word_frequency_dictionary():
    filename = 'dictionary.txt'
    infile = open(filename, 'r')
    lines = infile.readlines()
    my_dictionary = {}
    for line in lines:
        row = str(line.lower())
        for word in row.split():
            if word in my_dictionary:
                my_dictionary[word] = my_dictionary[word] + 1
            else:
                my_dictionary[word] = 1
    infile.close()
    print(my_dictionary)

create_word_frequency_dictionary()
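As a middle ground between the explicit if/else and Counter, dict.get with a default collapses the counting to one line; a minimal sketch against the same dictionary.txt:
def create_word_frequency_dictionary():
    my_dictionary = {}
    with open('dictionary.txt', 'r') as infile:
        for line in infile:
            for word in line.lower().split():
                # get(word, 0) supplies 0 the first time a word is seen
                my_dictionary[word] = my_dictionary.get(word, 0) + 1
    print(my_dictionary)

create_word_frequency_dictionary()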

If you are not using a version of Python that has Counter (it was added in 2.7), collections.defaultdict works too:
>>> import collections
>>> words = ["a", "b", "a", "c"]
>>> word_frequency = collections.defaultdict(int)
>>> for w in words:
...     word_frequency[w] += 1
...
>>> print word_frequency
defaultdict(<type 'int'>, {'a': 2, 'c': 1, 'b': 1})

Just replace my_dictionary[words] = frequency+1 with my_dictionary[words] = my_dictionary.get(words, 0)+1, so the count starts at 1 the first time a word is seen instead of raising a KeyError.

Related

Extracting ip address from a file in python and counting how many times the ip shows up

I have to read a text file, extract the most frequent IP address, and count how many times it comes up.
def anaylse_log(parameter):
    myfile = open("sample_log_1 test.txt", "r")
    iPdata = myfile.readlines()
    mydict = {}
    ipAddress = []
    item_list = []
    result_file = []
    counter = ()

    def extract_log(myfile):
        # split the file line by line
        for line in myfile:
            splitData = line.split()
            ipAddress = splitData[0]
            numbers = splitData[1]
            ipAddress.append(ipAddress)
            numbers.append(numbers)
            if numbers in mydict:
                # if numbers is already a key in the dictionary,
                # increase the count
                mydict[numbers] += 1
            else:
                # otherwise, if it's not yet in the dictionary,
                # initialise it to 1
                mydict[numbers] = 1
        return numbers
        myfile.close()

    def find_most_frequent(maximum, iPdata):
        with open("sample_log_1 text", "r") as myfile:
            for text in myfile:
                if str(maximum) in text:
                    return maximum
        with open("resultss.csv", "w") as file:
            file.write(maximum(maximum))
        # this will put the dictionary into tuples and give each key a value
        item_list = [(k, v) for k, v in mydict.items()]
        # this will sort the list by v
        item_list.sort(key=lambda x: x[1], reverse=True)
        maximum = mydict()

    def main(myfile, mydict, iPdata):
        result_file = open("resultss.csv", "w")

main()
I had to fix the spacing for the code to be edited; I hope this is OK and you are able to run it. I have been stuck on this for a while, and I thought I was calling the functions, too.
Suppose your log file is like:
15.25.7.3
25.25.2.5
25.25.2.5
115.25.7.3
215.25.7.3
25.25.2.5
Here is a simple way to count IPs:
ip_count_dict = {}
with open('ip.log', 'r') as f:
    ip_file = f.read()
# if separated by comma:
# ip_list = ip_file.split(',')
# if separated by \n newline:
ip_list = ip_file.splitlines()
for ip in ip_list:
    ip = ip.strip()
    if ip in ip_count_dict:
        ip_count_dict[ip] += 1
    else:
        ip_count_dict[ip] = 1
print(ip_count_dict)
Output: {'15.25.7.3': 1, '25.25.2.5': 3, '115.25.7.3': 1, '215.25.7.3': 1}
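Since the question also asks for the most frequent address, max with a key function pulls it straight out of the finished dict; a short sketch reusing ip_count_dict from above:
# the key function makes max compare counts rather than the IP strings
most_frequent = max(ip_count_dict, key=ip_count_dict.get)
print(most_frequent, ip_count_dict[most_frequent])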
Instead of manually counting IPs as you loop through your log, try this:
from collections import Counter
log_entries = open("resultss.csv").read().split("\n")
ip_list = [log.split(",")[0] for log in log_entries]
counts = Counter(ip_list)
print(counts)
This works with a CSV file format like:
10.10.10.1,asdf,31
5.9.7.11,aajbczxz,54
5.9.7.11,zzzzz,2
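To get just the most frequent address and its count from that Counter, most_common(1) returns a one-element list of (ip, count) pairs; a sketch reusing counts from the snippet above:
most_frequent_ip, occurrences = counts.most_common(1)[0]
print(f"{most_frequent_ip} appeared {occurrences} times")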

How to save words that occur no more than 3 times in a text? Reading and writing files

I am working on a text file right now that is called "dracula.txt", and I have to do the following in Python:
Save words that occur no more than 3 times in descending order in a file called less_common_words.txt. Each word with its count should be saved on a separate line.
I would appreciate any help! I've been working on this for too long.
I have already tokenized my file and counted the words. This is my code so far:
file = open("C:/Users/17733/Downloads/dracula.txt", 'r', encoding='utf-8-sig')
data = file.read()
data
data_list = data.split('\n')
data_list
new_list = []
for i in data_list:
    if i != '':
        ans_here = i.split(' ')
        new_list.extend(ans_here)
new_list
import string
import re
puncs = list(string.punctuation)
puncs.append('"')
puncs.append('[')
puncs.append('.')
puncs.append('-')
puncs.append('_')
# append each separately
new_2 = []
for i in new_list:
    for p in puncs:
        if p in i:
            i_new = i.replace(p, ' ')
            new_2.append(i_new)
new_2
new_2 = [i.replace('  ', ' ').strip().lower() for i in new_2]
new_2
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
output_filepath = Path('outfile.txt')
# print(filepath.exists())

with open(filepath) as f:
    content = f.readlines()

word_list = sum((
    (s.lower().strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])

less_common_words = sorted([
    key for key, value in Counter(word_list).items() if value <= 3
], reverse=True)

with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(less_common_words))
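The task statement also wants each word saved with its count on its own line, and "descending" most plausibly means descending by count; a hedged variant of the block above, reusing word_list and output_filepath:
counts = Counter(word_list)
less_common = sorted(
    ((word, count) for word, count in counts.items() if count <= 3),
    key=lambda pair: pair[1],
    reverse=True,  # highest counts first
)
with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(f'{word}: {count}' for word, count in less_common))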
This should be exactly what you need; I fixed my previous error by splitting the text into a 2D list and then flattening it:
book_open = open('frankenstein.txt', 'r').readlines()
beauty_book = [i.split() for i in book_open]
flatten = []
for sublist in beauty_book:
    for val in sublist:
        flatten.append(val)
foo = 0
for i in flatten:
    list_open = open('less_common_words.txt', 'r').readlines()
    beauty_list = [i.replace('\n', '') for i in list_open]
    count = flatten.count(flatten[foo])
    compile = str((flatten[foo], count))
    if count <= 3:
        if compile not in beauty_list:
            file = open('less_common_words.txt', 'a+')
            file.write('\n' + compile)
            file.close()
    foo += 1
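Note that flatten.count(...) inside the loop rescans the whole list for every word, and the output file is reopened on every iteration; a sketch with Counter (reusing flatten from above) does one counting pass and one write:
from collections import Counter

counts = Counter(flatten)  # counts every word in a single pass
with open('less_common_words.txt', 'w') as out:
    for word, count in counts.items():
        if count <= 3:
            out.write(f'{word} {count}\n')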

Print from High to Low occurrences of Dictionary in Python

I have code that counts every word in a file and how many times each word occurred.
filename = "test.txt"
output = []
with open(filename) as f:
    content = f.readlines()
content = [x.strip() for x in content]
wordlist = {}
for line in content:
    for entry in line.split():
        word = entry.replace('.', '')
        word = word.replace(',', '')
        word = word.replace('!', '')
        word = word.replace('?', '')
        if word not in wordlist:
            wordlist[word] = 1
        else:
            wordlist[word] = wordlist[word] + 1
print(wordlist)
However, when I print this, I am not able to order the output from high to low occurrences.
Here is a test file.
hello my friend. hello sir.
How do I print it so that it looks like:
hello: 2
my: 1
etc.?
from pathlib import Path
from collections import Counter
import string

filepath = Path('test.txt')
# print(filepath.exists())

with open(filepath) as f:
    content = f.readlines()

word_list = sum((
    (s.strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])

# most_common() yields (word, count) pairs sorted from highest to lowest count
for key, value in Counter(word_list).most_common():
    print(f'{key}: {value}')
In Python 3.7 and up, dicts preserve insertion order, so we can sort the dictionary items by value and then build a new dict from the result.
Use:
print(dict(sorted(wordlist.items(), key=lambda x: -x[1])))
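To print in the asked-for word: count format rather than as one big dict, the same sort can feed a loop (reusing wordlist from the question):
for word, count in sorted(wordlist.items(), key=lambda x: x[1], reverse=True):
    print(f'{word}: {count}')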

Python: Problem when making a dict from a text file

My text file looks like this:
comp_1-item_14,
comp_2-item_1,item_7,item_35
comp_3-item_4,item_7,item_10,item_1,item_2
I want to make a dictionary from the text file. It should look like
{"comp_1": ("item_14"), "comp_2": ("item_1","item_7","item_35")}
How can I get rid of the '-' and fix this? My code so far:
d = {}
with open('pr.txt', 'r') as p:
    for line in r:
        split = line.split()
        d[split[0]] = "-".join(split[0:])
print(d)
I just tried with a single line and it works fine:
line = "comp_3-item_4,item_7,item_10,item_1,item_2"
d = {}
line_list = line.split('-')
d[line_list[0]] = tuple(line_list[1].split(','))
print(d)
Output :
{'comp_3': ('item_4', 'item_7', 'item_10', 'item_1', 'item_2')}
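Extending that single-line version to the whole file is just a loop; str.partition splits on the first '-' only, and the if i filter drops the empty string left by the trailing comma on the comp_1 line (a sketch, assuming the pr.txt shown in the question):
d = {}
with open('pr.txt', 'r') as p:
    for line in p:
        name, _, items = line.strip().partition('-')
        d[name] = tuple(i for i in items.split(',') if i)
print(d)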
Change split = line.split() to split = line.split('-'). Note also that the loop should be for line in p:, matching the name the file was opened as.
Try this:
d = {}
with open('pr.txt', 'r') as f:
    for l in f.readlines():
        split_ = l.strip().split("-")
        d[split_[0]] = tuple(x for x in split_[1].split(",") if x)
{'comp_1': ('item_14',), 'comp_2': ('item_1', 'item_7', 'item_35')...}
d = {}
with open('pr.txt', 'r') as p:
    for line in p:
        s = line.strip().split('-')
        d[s[0]] = "".join(s[1:])
print(d)

How to read from file after some 'mark'?

For example, suppose I have a text/log file with a very simple structure, consisting of a few parts with different formats, split by a marker line, e.g.:
0x23499 0x234234 0x234234
...
0x34534 0x353454 0x345464
$$$NEW_SECTION$$$
4345-34534-345-345345-3453
3453-34534-346-766788-3534
...
So, how can I read the file in these parts? E.g. read everything before the $$$NEW_SECTION$$$ mark into one variable and everything after it into another (without using regexps, etc.). Are there any simple solutions for that?
Here is a solution that doesn't read the whole file into memory:
data1 = []
pos = 0
with open('data.txt', 'r') as f:
    line = f.readline()
    while line and not line.startswith('$$$'):
        data1.append(line)
        line = f.readline()
    pos = f.tell()

data2 = []
with open('data.txt', 'r') as f:
    f.seek(pos)
    for line in f:
        data2.append(line)

print(data1)
print(data2)
The first loop can't be written as for line in f: the file iterator reads ahead in buffered chunks, so f.tell() would not report the position right after the marker line.
The simplest solution is str.split:
>>> s = filecontents.split("$$$NEW_SECTION$$$")
>>> s[0]
'0x23499 0x234234 0x234234\n\n0x34534 0x353454 0x345464\n'
>>> s[1]
'\n4345-34534-345-345345-3453\n3453-34534-346-766788-3534'
Solution 1:
If the file is not very big:
with open('your_log.txt') as f:
    parts = f.read().split('$$$NEW_SECTION$$$')

if len(parts) > 0:
    part1 = parts[0]
    ...
Solution 2:
def FileParser(filepath):
    with open(filepath) as f:
        part = ''
        # the walrus operator needs Python 3.8+
        while (line := f.readline()):
            if line.strip() == '$$$NEW_SECTION$$$':
                yield part
                part = ''
            else:
                part += line
        yield part  # the final section has no marker after it

for segment in FileParser('your_log.txt'):
    print(segment)
Note: it is untested code, so please validate it before using.
Solution:
def sec(file_, sentinel):
    with open(file_) as f:
        section = []
        for i in iter(f.readline, ''):
            if i.rstrip() == sentinel:
                yield section
                section = []
            else:
                section.append(i)
        yield section
and use:
>>> from pprint import pprint
>>> pprint(list(sec('file.txt', '$$$NEW_SECTION$$$')))
[['0x23499 0x234234 0x234234\n', '0x34534 0x353454 0x345464\n'],
 ['4345-34534-345-345345-3453\n',
  '3453-34534-346-766788-3534\n',
  '3453-34534-346-746788-3534\n']]
>>>
Sections to variables, or better, sections to a dict:
>>> sections = {}
>>> for n, section in enumerate(sec('file.txt', '$$$NEW_SECTION$$$')):
...     sections[n] = section
>>>
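If the file is known to contain exactly two sections, tuple unpacking of the generator is even shorter than building a dict (same sec generator and marker as above):
>>> before, after = sec('file.txt', '$$$NEW_SECTION$$$')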
