Python Count paragraph - python

Hello all, I've been tasked with counting lines and paragraphs. Counting every line is obviously easy, but I'm stuck on counting the paragraphs. A blank line (one with no characters) should get the paragraph number zero, and each new paragraph should get the next higher number. For example, given the input file (Input), the output file should look like this (Output).
so my code is:
# Question's attempt: copy infile to outfile, prefixing each line with a
# paragraph number and a running line number.
def insert_line_para_nums(infile, outfile):
f = open(infile, 'r')
out = open(outfile, 'w')
linecount = 0
for i in f:
# NOTE(review): the counter is reset at the top of every iteration, so it can
# never grow past 1; it would have to live outside the loop to number paragraphs.
paragraphcount = 0
# Only lines that contain a newline advance the line counter.
if '\n' in i:
linecount += 1
# A line shorter than 2 characters is treated as blank; "*= 0" is just a reset to 0.
if len(i) < 2: paragraphcount *= 0
# NOTE(review): a line of exactly 2 characters (one character plus the
# newline) matches neither branch.
elif len(i) > 2: paragraphcount = paragraphcount + 1
# Output layout: paragraph number left-aligned in 4 columns, line number
# right-aligned in 4 columns, then the original line.
out.write('%-4d %4d %s' % (paragraphcount, linecount, i))
f.close()
out.close()

def insert_line_para_nums(infile, outfile):
    """Copy *infile* to *outfile*, prefixing every line with two numbers.

    Each output line has the form ``'%-4d %4d %s'``: the paragraph number,
    the 1-based line number, and the original line (newline included).

    A paragraph is a run of consecutive non-empty lines. Empty lines (nothing
    but the newline) get paragraph number 0; the lines of the n-th paragraph
    get paragraph number n.

    Args:
        infile: Path of the text file to read.
        outfile: Path of the file to create or overwrite.
    """
    paragraph_count = 0
    in_paragraph = False
    # Both files are closed by the with-statement even if an error occurs.
    with open(infile, 'r') as src, open(outfile, 'w') as dst:
        # enumerate() also numbers a final line that lacks a trailing newline.
        for line_number, line in enumerate(src, start=1):
            if line.rstrip('\n'):
                # Any non-empty line counts, including single-character ones.
                if not in_paragraph:
                    paragraph_count += 1
                    in_paragraph = True
                paragraph_number = paragraph_count
            else:
                in_paragraph = False
                paragraph_number = 0
            dst.write('%-4d %4d %s' % (paragraph_number, line_number, line))

This is one way to do it, and not the prettiest.
import re
def count_paragraphs(lines):
    """Return the number of paragraphs in *lines*.

    A paragraph is a maximal run of consecutive lines that contain at least
    one non-whitespace character; blank lines only separate paragraphs.

    Args:
        lines: Any iterable of text lines, with or without trailing newlines
            (an open text file works).

    Returns:
        The paragraph count; 0 for empty or all-blank input.
    """
    paragraph = 0
    in_paragraph = False
    for line in lines:
        if line.strip():
            # Count a paragraph once, on its first non-blank line.
            if not in_paragraph:
                paragraph += 1
            in_paragraph = True
        else:
            in_paragraph = False
    return paragraph


if __name__ == '__main__':
    # The with-statement closes a.txt once counting is done.
    with open('a.txt', 'r') as f:
        print(count_paragraphs(f))

Related

Enter single digit but the result have the tens digit when use python search value

When I input 7,17 but the result is
ifDescr.7
ifDescr.70
ifDescr.17
How do I code it so that entering 7 17 returns only the lines for 7 and 17, like this?
ifDescr.7
ifDescr.17
text file
ifDescr.7
ifDescr.70
ifDescr.17
# Collect every line of file_name that contains any of list_of_strings.
# Each hit is stored as a (pattern, 1-based line number, right-stripped line) tuple.
def search_multiple(file_name, list_of_strings):
line_number = 0
list_of_results = []
with open(file_name, 'r') as read:
for line in read:
line_number += 1
for x in list_of_strings:
# Substring test: 'ifDescr.7' is also found inside 'ifDescr.70'.
if x in line:
list_of_results.append((x,line_number,line.rstrip()))
return list_of_results
# Script entry point: ask for two numbers and print the matching lines of single.txt.
def main ():
# The parentheses do not make a tuple; folder is the plain string 'single.txt'.
folder = ('single.txt')
# Unpacking into two names needs exactly two whitespace-separated tokens;
# any other count raises ValueError.
verify1,verify2 = input ("Input number").split()
matched_lines = search_multiple(folder,['ifDescr.' + verify1, 'ifDescr.' + verify2,])
for x in matched_lines:
# x[2] is the third element of each tuple returned by search_multiple.
print('Line = ', x[2])
if __name__ == '__main__':
main()
The reason for this behavior is that you are using `in` to check whether the string occurs in the line. Since ifDescr.70 contains ifDescr.7, that line is matched as well. Try the function below, which compares against the whole stripped line instead:
def search_multiple(file_name, list_of_strings):
    """Return the lines of *file_name* that exactly equal one of the given strings.

    A line matches when, after stripping surrounding whitespace, it is equal
    to an entry of *list_of_strings*; 'ifDescr.70' therefore does not match
    'ifDescr.7'.

    Args:
        file_name: Path of the text file to scan.
        list_of_strings: Strings to look for, compared against whole lines.

    Returns:
        A list of ``(matched string, 1-based line number, right-stripped line)``
        tuples in file order. A string listed twice yields two tuples.
    """
    list_of_results = []
    with open(file_name, 'r') as read:
        for line_number, line in enumerate(read, start=1):
            # Strip once per line instead of once per candidate string.
            stripped = line.strip()
            trimmed = line.rstrip()
            for x in list_of_strings:
                if x == stripped:
                    list_of_results.append((x, line_number, trimmed))
    return list_of_results

Read the file and split by the sum of words, upload to a folder

I figured out how to count the words and how to split the file. How do I combine the two, so that a ***** marker is placed at the end of a sentence (after the full stop) once a section has more than 200 but fewer than 250 words?
# text counting method
number_of_lines = 0
number_of_words = 0
number_of_characters = 0
# The with-statement closes test.txt even if reading fails part-way through.
with open("test.txt", "r") as file:
    for line in file:
        # Drop the newline so it is not counted as a character.
        line = line.strip("\n")
        words = line.split()
        number_of_lines += 1
        number_of_words += len(words)
        number_of_characters += len(line)
# NOTE: the last argument is a parenthesised tuple, so it is printed in tuple
# form (with parentheses and quotes) after the individual values.
print("lines:", number_of_lines, "words:", number_of_words, "characters:", number_of_characters, ( "sum_chsrters:", number_of_characters // 1000, "sum_wolds:", number_of_words // 120 ))
# text splitting method
# Write the text collected between '*****' marker lines of test.txt to
# numbered files 1.txt, 2.txt, ...
with open('test.txt') as fo:
# op accumulates the text collected so far; cntr numbers the output files.
# start is presumably set to 1 by the first marker -- the indentation was
# lost, so confirm which "if" the first "else" below belongs to.
op = ''
start = 0
cntr = 1
for x in fo.read().split("\n"):
# A line consisting solely of '*****' is a marker.
if (x=='*****'):
if (start==1):
# Flush the collected text to '<cntr>.txt', then start collecting afresh.
with open(str(cntr) + '.txt', 'w') as opf:
opf.write(op)
# NOTE(review): redundant -- the with-statement already closes opf.
opf.close()
op=''
cntr+=1
else:
start=1
else:
# Join collected lines with '\n', without a leading separator.
if (op==''):
op = x
else:
op = op + '\n' + x
# NOTE(review): redundant if this sits inside the with-block above -- confirm.
fo.close()
P.S. I'm a beginner. The file (10,000+ characters in total) needs to be split into 10 parts (about 1000 characters or 120 words each), saved in a folder as 01.txt, 02.txt, ..., 10.txt.

How to split file with certain conditions for each end line of each file

I have a .txt file like this:
2019-03-29 12:03:07 line1
line2
line3
....
2019-03-30 07:05:09 line1
line2
....
2019-03-31 10:03:20 line1
line2
....
I split the file into several files, like this:
# Path of the file to split and the number of chunk files to produce.
# NOTE(review): '\.' is not a recognised escape sequence, so the backslash
# stays in the string; a raw string would make that explicit.
inputData = 'dirname\..'
numThrd = 3
# Split inputData by line count into files named rawData1.txt, rawData2.txt, ...
# NOTE(review): relies on math, os and matchLine, none of which are defined
# in this snippet.
def chunkFiles():
# First pass: count the lines so a per-chunk size can be derived.
nline = sum(1 for line in open(inputData,'r', encoding='utf-8', errors='ignore'))
chunk_size = math.floor(nline/int(numThrd))
n_thread = int(numThrd)
# j is the number of chunk files opened so far.
j = 0
with open(inputData,'r', encoding='utf-8', errors='ignore') as fileout:
for i, line in enumerate(fileout):
# Close the current chunk when its line quota is reached.
# NOTE(review): enumerate() yields indices 0..nline-1, so "i == nline" looks
# unreachable and the last chunk file does not appear to be closed here -- confirm.
if (i + 1 == j * chunk_size and j != n_thread) or i == nline:
out.close()
# Open the next chunk file on the first line and at every chunk boundary.
if i + 1 == 1 or (j != n_thread and i + 1 == j * chunk_size):
chunkFile = 'rawData' + str(j+1) + '.txt'
# Leave the loop rather than overwrite an existing chunk file.
if os.path.isfile(chunkFile ):
break
out = open(chunkFile , 'w+', encoding='utf-8', errors='ignore')
j = j + 1
# The line without its final character; not used again in the visible code.
fLine = line[:-1]
if not matchLine:
if out.closed != True:
out.write(line)
# Progress message every 1000 lines.
if i % 1000 == 0 and i != 0:
print ('Processing line %i ...' % (i))
However, I want the split file to meet the condition that the last line in the chunk file must be right before the line that has the date.
recent output that I got:
rawData1.txt
2019-03-29 12:03:07 line1
line2
....
-------------------------
rawData2.txt
line50
line51
2019-03-30 07:05:09 line1
line2
.....
Desired output:
rawData1.txt
2019-03-29 12:03:07 line1
line2
line3
....
-------------------------
rawData2.txt
2019-03-30 07:05:09 line1
line2
....
what should I add to the script above to meet that conditions?
Thank you very much
You can produce the desired output by using a list to hold the lines you want to write (see below).
def write_chunk(filename, chunk):
    """Write the strings in *chunk* to *filename*, replacing any existing file.

    Args:
        filename: Path of the file to create or overwrite.
        chunk: Iterable of strings, written back to back with no separator
            added (lines are expected to carry their own newlines).
    """
    with open(filename, "w") as out:
        # One batched call instead of a Python-level loop of write() calls.
        out.writelines(chunk)
# Lines gathered for the chunk being built, and the number used in its file name.
chunk = []
n_chunk = 1
with open("data.txt") as f:
for line in f:
# A line whose first character is not whitespace starts a new record, so the
# lines gathered so far are written out first (skipped while chunk is empty).
# NOTE(review): assumes continuation lines begin with whitespace -- confirm
# against the real data.
if not line[0].isspace() and chunk:
write_chunk("{}.txt".format(n_chunk), chunk)
chunk = []
n_chunk += 1
chunk.append(line)
# write final chunk
write_chunk("{}.txt".format(n_chunk), chunk)

Stop replacement by pattern?

Say my file look like this:
some lines
tom
some lines
beginword a b
some lines
endword
jim
some lines
beginword x y
some lines
endword
...
Want to be:
some lines
tom
some lines
beginword ZZ b
some lines
endword
jim
some lines
beginword x y
some lines
endword
So this is my python code:
# Question's attempt: copy "file" to "file_updated", rewriting "beginword" lines
# that come after the line "tom".
# NOTE(review): the name `input` shadows the builtin of the same name.
input = open("file", "r")
output = open("file_updated", "w")
# dummy accumulates the whole output text; it is written once at the end.
dummy = ""
item = []
for line in input:
dummy += line
# Once a line containing "tom\n" is seen, the inner loop takes over.
if line.find("tom" + "\n") != -1:
# The inner loop iterates the same file object and has no exit condition, so
# it consumes every remaining line.
for line in input:
if line.find("beginword") != -1:
# Keep the first and last whitespace-separated tokens and put "w" between them.
item = line.split()
dummy += item[0] + " w " + item[-1] + "\n"
else:
dummy += line
output.write(dummy)
input.close()
output.close()
It replaces all lines that contain "beginword", including the ones that belong to "jim". How can I stop the replacement at the "endword" that belongs to "tom"?
Use break statement
# Copy "file" to "file_updated". Inside the section that follows a line ending
# in "tom", every "beginword" line is rewritten as "<first token> w <last token>";
# rewriting stops at that section's "endword" line.
parts = []  # output fragments, joined once at the end instead of repeated +=
# Context managers close both files even on error, and the handles no longer
# shadow the builtin input().
with open("file", "r") as src, open("file_updated", "w") as dst:
    for line in src:
        parts.append(line)
        # NOTE(review): a substring test, so a line such as "atom" also matches.
        if "tom\n" in line:
            # Deliberately iterate the same file object: the inner loop picks
            # up where the outer one stopped, and the outer loop resumes after
            # the break.
            for line in src:
                # check for endword and exit for loop
                if line.startswith("endword\n"):
                    parts.append(line)
                    break
                if "beginword" in line:
                    item = line.split()
                    parts.append(item[0] + " w " + item[-1] + "\n")
                else:
                    parts.append(line)
    dst.write("".join(parts))
Also, have a look at these:
reading and writing files
regular expressions

Python - how to print amount of numbers, periods, and commas in file

# Print line, word, number, comma and dot counts for the file fileName.
def showCounts(fileName):
lineCount = 0
wordCount = 0
numCount = 0
comCount = 0
dotCount = 0
with open(fileName, 'r') as f:
for line in f:
words = line.split()
lineCount += 1
wordCount += len(words)
for word in words:
# ###text = word.translate(string.punctuation)
# NOTE(review): `string` is not imported in the visible snippet.
exclude = set(string.punctuation)
# NOTE(review): text is reset to "" and the join below filters text itself
# rather than word, so text is always empty and int(text) always raises
# ValueError.
text = ""
text = ''.join(ch for ch in text if ch not in exclude)
try:
# The condition holds for every integer, so any text that parses as an
# int would be counted.
if int(text) >= 0 or int(text) < 0:
numCount += 1
# elif text == ",":
# comCount += 1
# elif text == ".":
# dotCount += 1
except ValueError:
pass
# comCount and dotCount are never modified above, so they are reported as 0.
print("Line count: " + str(lineCount))
print("Word count: " + str(wordCount))
print("Number count: " + str(numCount))
print("Comma count: " + str(comCount))
print("Dot count: " + str(dotCount) + "\n")
Basically, it shows the number of lines and the number of words, but I can't get it to show the number of numbers, commas, and dots. It reads a file that the user enters and then shows the line and word counts, but for some reason it reports 0 for numbers, commas, and dots. I commented out the part that gave me trouble. If I remove the comma, I just get an error. Thanks, guys.
This code loops over every character in each line, and adds 1 to its variable:
# Tally digits, dots, commas, lines and words in test.txt and print the totals.
numCount = 0
dotCount = 0
commaCount = 0
lineCount = 0
wordCount = 0
fileName = 'test.txt'
with open(fileName, 'r') as f:
    for line in f:
        wordCount += len(line.split())
        lineCount += 1
        # numCount counts individual digit characters, so "125" adds 3.
        numCount += sum(1 for char in line if char.isdigit())
        dotCount += line.count('.')
        commaCount += line.count(',')
print("Number count: " + str(numCount))
print("Comma count: " + str(commaCount))
print("Dot count: " + str(dotCount))
print("Line count: " + str(lineCount))
print("Word count: " + str(wordCount))
Testing it out:
test.txt:
Hello, my name is B.o.b. I like biking, swimming, and running.
I am 125 years old, and I was 124 years old 1 year ago.
Regards,
B.o.b
Running:
bash-3.2$ python count.py
Number count: 7
Comma count: 5
Dot count: 7
Line count: 6
Word count: 27
bash-3.2$
Everything makes sense here except lineCount. The reason it is 6 is the newlines: the file contains blank lines, and my editor (nano) adds a newline to the end of every file by default. So the text file actually looks like this:
>>> x = open('test.txt').read()
>>> x
'Hello, my name is B.o.b. I like biking, swimming, and running.\n\nI am 125 years old, and I was 124 years old 1 year ago.\n\nRegards,\nB.o.b \n'
>>> x.count('\n')
6
>>>
Hope this helps!
For the punctuations, why not just do:
# Sketch only: the "..." lines stand for the rest of showCounts.
def showCounts(fileName):
...
...
# Read the whole file once and let str.count tally the punctuation.
with open(fileName, 'r') as fl:
f = fl.read()
comCount = f.count(',')
dotCount = f.count('.')
You could use the Counter class to take care of it for you:
from collections import Counter

# Report line, word, digit, comma and dot counts for the file named by
# fileName (expected to be defined by the surrounding code).
with open(fileName, 'r') as f:
    # Strip so leading/trailing blank lines do not inflate the line count.
    data = f.read().strip()

# An empty file has 0 lines; splitting '' on '\n' would report 1.
lines = data.count('\n') + 1 if data else 0
words = len(data.split())
# One pass over the text gives the frequency of every character.
counts = Counter(data)
# Sum of digit-character frequencies, i.e. digits rather than whole numbers.
numbers = sum(v for (k, v) in counts.items() if k.isdigit())
print("Line count: {}".format(lines))
print("Word count: {}".format(words))
print("Number count: {}".format(numbers))
print("Comma count: {}".format(counts[',']))
print("Dot count: {}".format(counts['.']))

Categories

Resources