parse, find chapters, and write out as separate files - python

I am having difficulty getting the right code to parse out chapters from this ebook and then write each of the 27 chapters into its own text file. The farthest I've gotten is printing "CHAPTER-1.txt". I don't want to hard-code anything and am unsure where I've completely missed the mark.
# Read the table-of-contents region of the ebook and collect the chapter
# titles (lines whose last whitespace-separated token is a page number).
# NOTE(review): the 74:185 slice is specific to this copy of dracula.txt —
# TODO confirm the TOC location against the actual file.
with open('dracula.txt', 'r') as infile:
    readlines = infile.readlines()

toc_list = readlines[74:185]

# Keep only non-blank TOC lines, stripped of surrounding whitespace.
toc_text_lines = [line.strip() for line in toc_list if len(line) > 1]

# A TOC entry ends with its page number (e.g. "CHAPTER I. ... 5"); use the
# trailing digit to tell chapter titles from other TOC text.  The extra
# truthiness check guards against whitespace-only lines stripping to ''
# (the original indexed [-1] on an empty split and crashed).
chaptitles = [text for text in toc_text_lines
              if text.split() and text.split()[-1].isdigit()]
print(chaptitles)
# NOTE(review): this second attempt is incomplete and will not run as written:
#   - `chapters_names_list` is used but never defined
#   - the `for ... :` line is followed by a bare list literal, not a loop body
#   - `book.pop(0)` inside the `if` consumes a line even when the test fails
#   - the final `with open(...)` has no `as` target and no body
import re
with open('dracula.txt') as f:
book = f.readlines()
# Consume the book line by line, looking for a "CHAPTER" heading followed
# by a blank line.
while book:
line = book.pop(0)
if "CHAPTER" in line and book.pop(0) == '\n':
# Intended: write each chapter out to "<title>.txt" — never finished.
for title in chapters_names_list: ['CHAPTER I.', 'CHAPTER II.',
'CHAPTER III.']
with open("{}.txt".format(chapters_names_list), 'w') :

I think you could benefit from generators: if one of the ebooks is too big to fit into memory, you will run into issues.
What you can do is construct sort of a data processing pipeline, first look for the file(ebook.txt) in the filesystem, though have in mind that we need all functions to be as general as possible, once we have the filename we open it and yield one line at a time, and finally we scan each line for 'CHAPTER I.', 'CHAPTER II.', etc
import os
import re
import fnmatch
def find_files(pattern, path):
    """Yield the full path of every file under *path* whose name matches
    the shell wildcard *pattern* (e.g. 'dracula*'), so the filename never
    has to be hard-coded.
    """
    for dirpath, _subdirs, filenames in os.walk(path):
        for hit in fnmatch.filter(filenames, pattern):
            yield os.path.join(dirpath, hit)
def file_opener(filenames):
    """Open a sequence of text filenames one at a time, yielding each
    open file object; non-.txt names are skipped.

    Bug fix: the original called ``f.close()`` on the line *after*
    ``yield f``, so the file stayed open until the consumer advanced the
    generator again — and was never closed at all if the consumer stopped
    early or raised.  The ``with`` block closes the file deterministically
    in every case (including GeneratorExit).
    """
    for filename in filenames:
        if filename.endswith('.txt'):
            with open(filename, 'rt') as f:
                yield f
def chain_generators(iterators):
    """Flatten a sequence of iterables into one continuous stream:
    every item of the first, then every item of the second, and so on.
    """
    for iterable in iterators:
        for item in iterable:
            yield item
def grep(pattern, lines):
    """Yield each line of *lines* that contains a match for the regular
    expression *pattern* (e.g. 'CHAPTER I.').
    """
    # Compile once up front; bind .search locally for the hot loop.
    matcher = re.compile(pattern).search
    for candidate in lines:
        if matcher(candidate):
            yield candidate
# A simple way to use these functions together
# NOTE(review): 'Path/to/files' is a placeholder — point it at the
# directory that actually contains the ebook(s).
logs = find_files('dracula*', 'Path/to/files')
files = file_opener(logs)
lines = chain_generators(files)
# Filter the combined stream down to lines matching 'CHAPTER I.'
# (note: '.' is a regex wildcard here, so this also matches 'CHAPTER II').
each_line = grep('CHAPTER I.', lines)
for match in each_line:
print(match)
You can build on top of these implementation to accomplish what you're trying to do.
Let me know if this helped.

Related

How to edit specific line for all text files in a folder by python?

Here below is my code about how to edit text file.
Since python can't just edit a line and save it at the same time,
I save the previous text file's content into a list first then write it out.
For example,if there are two text files called sample1.txt and sample2.txt in the same folder.
Sample1.txt
A for apple.
Second line.
Third line.
Sample2.txt
First line.
An apple a day.
Third line.
Execute python
import glob
import os
#search all text files which are in the same folder with python script
path = os.path.dirname(os.path.abspath(__file__))
txtlist = glob.glob(path + '\*.txt')
for file in txtlist:
fp1 = open(file, 'r+')
strings = [] #create a list to store the content
for line in fp1:
if 'apple' in line:
strings.append('banana\n') #change the content and store into list
else:
strings.append(line) #store the contents did not be changed
fp2 = open (file, 'w+') # rewrite the original text files
for line in strings:
fp2.write(line)
fp1.close()
fp2.close()
Sample1.txt
banana
Second line.
Third line.
Sample2.txt
First line.
banana
Third line.
That's how I edit specific line for text file.
My question is : Is there any method can do the same thing?
Like using the other functions or using the other data type rather than list.
Thank you everyone.
Simplify it to this:
# NOTE(review): `fname` must be defined by the caller before this runs.
with open(fname) as f:
    content = f.readlines()
# Bug fix: the replacement must keep its trailing newline, otherwise the
# 'banana' line merges with the following line when written back out.
# `'apple' in line` is also the idiomatic spelling of line.find('apple') != -1.
content = ['banana\n' if 'apple' in line else line for line in content]
and then write value of content to file back.
Instead of putting all the lines in a list and writing it, you can read it into memory, replace, and write it using same file.
def replace_word(filename):
    """Load *filename* into memory, swap every 'word1' for 'word2',
    and write the result back over the same file.
    """
    with open(filename, 'r') as handle:
        text = handle.read()
    updated = text.replace('word1', 'word2')
    with open(filename, 'w') as handle:
        handle.write(updated)
Then you can loop through all of your files and apply this function
The built-in fileinput module makes this quite simple:
import fileinput
import glob
# fileinput's inplace=True mode redirects stdout into the file currently
# being read, so print() rewrites the file line by line as we iterate it.
with fileinput.input(files=glob.glob('*.txt'), inplace=True) as files:
for line in files:
if 'apple' in line:
# print() appends its own '\n', replacing the whole line.
print('banana')
else:
# end='' because the original line already carries its '\n'.
print(line, end='')
fileinput redirects print into the active file.
import glob
import os
def replace_line(file_path: str, replace_table: dict) -> None:
    """Rewrite *file_path*, replacing any line that contains a key of
    *replace_table* with the corresponding value plus a newline.

    Only the first matching key is applied per line, and the file is
    rewritten only if at least one line actually changed (so an
    untouched file keeps its mtime).
    """
    new_lines = []
    changed = False
    with open(file_path, 'r') as f:
        for line in f:
            for key, new_val in replace_table.items():
                if key in line:
                    new_lines.append(new_val + '\n')
                    changed = True
                    break  # only the first matching key is applied
            else:
                # no key matched — keep the line untouched
                new_lines.append(line)
    if not changed:
        return
    with open(file_path, 'w') as f:
        # writelines instead of a side-effect list comprehension
        f.writelines(new_lines)
if __name__ == '__main__':
# Apply the replacements to every .txt file living next to this script.
work_dir = os.path.dirname(os.path.abspath(__file__))
txt_list = glob.glob(work_dir + '/*.txt')
replace_dict = dict(apple='banana', orange='grape')
for txt_path in txt_list:
replace_line(txt_path, replace_dict)

How to read and process data from a set of text files sequentially?

I have 50 text files (namely Force1.txt, Force2.txt, ..., Force50.txt). The files look like this:
0.0000000e+000 -1.4275799e-003
2.0000000e-002 -1.1012760e-002
4.0000000e-002 -1.0298970e-002
6.0000000e-002 -8.9733599e-003
8.0000000e-002 -9.6871497e-003
1.0000000e-001 -1.2236400e-002
1.2000000e-001 -1.4479739e-002
1.4000000e-001 -1.3052160e-002
1.6000000e-001 -1.1216700e-002
1.8000000e-001 -8.6674497e-003
2.0000000e-001 -8.6674497e-003
2.2000000e-001 -1.3358070e-002
2.4000000e-001 -1.7946720e-002
2.6000000e-001 -1.9782179e-002
I wish to read data from Force1.txt, store the data in a list of tuples, and analyze these data (the details of such analysis are not relevant to the question). Then I have to do the same with Force2.txt, Force3.txt, and so on.
Here is my attempt:
def load_data(fn):
    """Parse *fn*, a whitespace-separated numeric table, into a list of
    float tuples — one tuple per line.
    """
    with open(fn) as f:
        rows = (line.split() for line in f.readlines())
        return [tuple(float(field) for field in row) for row in rows]
def display_data(lst):
    """Render *lst* the way repr() would, but with all square brackets
    stripped out (so a list of tuples prints as bare tuples).
    """
    # repr(lst) is the idiomatic spelling of lst.__repr__()
    return repr(lst).replace('[', '').replace(']', '')
pp = []
for file in os.listdir("dir"):
if file.endswith('.txt'):
if file.startswith('Force'):
print os.path.join(r"dir", file)
with open(file) as f:
for line in f:
pp.append(map(float, line.split()))
# BUG: pp is never reset between files, so the time column restarts at
# 0.0 for every new ForceN.txt — the accumulated list is what triggers
# "Invalid time values, expected monotonically increasing numbers".
# NOTE(review): `print x` syntax plus `mdb`/SOLVER_DEFAULT/STEP indicate
# this runs under Abaqus' Python 2 interpreter — TODO confirm.
# `file` also shadows the Python 2 builtin of the same name.
mdb.models['Model-1'].TabularAmplitude(data=pp, name='Table', smooth=SOLVER_DEFAULT, timeSpan=STEP)
I'm getting this error:
'Table', smooth=SOLVER_DEFAULT, timeSpan=STEP):
Invalid time values, expected monotonically increasing numbers
How can I solve this issue?
This code should do:
import os
def load_data(fn):
    """Read *fn* (whitespace-separated numeric columns) and return one
    tuple of floats per line.
    """
    with open(fn) as f:
        raw_rows = [line.split() for line in f.readlines()]
    return [tuple(map(float, fields)) for fields in raw_rows]
def display_data(lst):
    """Render *lst* the way repr() would, with all square brackets
    removed (a list of tuples prints as bare tuples).
    """
    # repr(lst) is the idiomatic spelling of lst.__repr__()
    return repr(lst).replace('[', '').replace(']', '')
dirname = r"Your dir name goes here"
for filename in os.listdir(dirname):
if filename.endswith('.txt'):
if filename.startswith('Force'):
pathfile = os.path.join(dirname, filename)
# Python 2 print statements — this targets Abaqus' bundled interpreter.
print pathfile
# pp is rebuilt from scratch for each ForceN.txt file; that per-file
# reset is the fix for the "monotonically increasing" time error.
pp = load_data(pathfile)
print display_data(pp)
mdb.models['Model-1'].TabularAmplitude(data=pp,
name='Table',
smooth=SOLVER_DEFAULT,
timeSpan=STEP)
You just need to update dirname with the name of the directory which contains the text files. I recommend you not to use file as a variable identifier because file is a reserved word in Python. I've used filename instead.

Python searching for exact word/phrase within a text file

Currently, I'm trying to search for an exact word/phrase in a text file. I am using Python 3.4
Here is the code I have so far.
import re
def main():
    """Prompt for a file name (without extension) and a search term,
    then delegate to regex_search to write matching lines to new.txt.
    """
    base_name = input("Please input the file name").lower()
    term = input("Please enter the search term").lower()
    regex_search(base_name + ".txt", term)
def regex_search(file, term):
    """Copy every line of *file* that matches the regex *term* into
    a file named new.txt in the current directory.

    Improvements: `with` guarantees both handles are closed even if the
    regex or I/O raises, the pattern is compiled once instead of being
    re-parsed for every line, and matches are written as they are found
    instead of being buffered in an intermediate list.
    """
    pattern = re.compile(term)
    with open(file, 'r') as source, open("new.txt", 'w') as destination:
        for line in source:
            if pattern.search(line):
                destination.write(line)
# Disabled draft kept for reference: splitting a line on whitespace can
# never match a multi-word phrase like "dog walked", because each list
# element is a single word.
'''
def search(file, term): #This function doesn't work
source = open(file, 'r')
destination = open("new.txt", 'w')
lines = [line for line in source if term in line.split()]
for line in lines:
destination.write(line)
source.close()
destination.close()'''
# Run the interactive prompt when the script is executed.
main()
In my function regex_search I use regex to search for the particular string. However, I don't know how to search for a particular phrase.
In the second function, search, I split the line into a list and search for the word in there. However, this won't be able to search for a particular phrase because I am searching for ["dog walked"] in ['the','dog','walked'] which won't return the correct lines.
edit: Considering that you don't want to match partial words ('foo' should not match 'foobar'), you need to look ahead in the data stream. The code for that is a bit awkward, so I think regex (your current regex_search with a fix) is the way to go:
def regex_search(filename, term):
    """Write to new.txt every line of *filename* containing *term* as a
    whole word/phrase — no partial-word matches ('foo' will not match
    'foobar').

    Bug fix: the original opened `file` — an undefined name shadowing
    the builtin — instead of its `filename` parameter, so every call
    failed before reading anything.
    """
    # The term must be followed by a non-word, non-hyphen character or
    # the end of the line, which rules out partial-word hits.
    searcher = re.compile(term + r'([^\w-]|$)').search
    with open(filename, 'r') as source, open("new.txt", 'w') as destination:
        for line in source:
            if searcher(line):
                destination.write(line)

Reading a file and adding a specific match to a dictionary in python

Id like to read a file for a specific match in the following style "word = word", specifically Im looking to find files with usernames and passwords in them. These files would be scripts created by admins using bad practices with clear credentials being used in logonscripts etc.
The code I have created so far does the job, but it's very messy and prints an entire line if a match is found (I can't help but think there is a more elegant way to do this). This creates ugly output; I'd like to print only the match in the line, but I can't seem to find a way to do that. If I can create the correct regex for something like the match described below, is it possible to print only the match found in the line rather than the entire line?
(I am going to try describe the type of match im looking for)
Key
* = wildcard
- = space
^ = anycharacter until a space
Match
*(U|u)ser^-=-^
# Root directory to scan for admin scripts with embedded credentials.
dirt = "/dir/path/"
def get_files():
# Walk the tree and inspect every script-like file for user/pass lines.
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
# NOTE(review): readfile is never closed — a `with` block would fix the leak.
readfile = open(os.path.join(root, filename), "r")
for line in readfile:
# NOTE(review): `and` binds tighter than `or`, so this parses as
# (user AND =) OR (pass AND =) — apparently the intent, but explicit
# parentheses would make it clear.  `print line` is Python 2 syntax.
if re.match("(.*)(U|u)ser(.*)", line) and re.match("(.*)(=)(.*)", line) or re.match("(.*)(P|p)ass(.*)", line) and re.match("(.*)(=)(.*)", line):
print line
TEST SCRIPT
strComputer = "atl-ws-01"
strNamespace = “root\cimv2”
strUser = "Administrator"
strPassword = "4rTGh2#1"
user = AnotherUser #Test
pass = AnotherPass #test
Set objWbemLocator = CreateObject("WbemScripting.SWbemLocator")
Set objWMIService = objwbemLocator.ConnectServer _
(strComputer, strNamespace, strUser, strPassword)
objWMIService.Security_.authenticationLevel = WbemAuthenticationLevelPktPrivacy
Set colItems = objWMIService.ExecQuery _
("Select * From Win32_OperatingSystem")
For Each objItem in ColItems
Wscript.Echo strComputer & ": " & objItem.Caption
Next
Latest code after taking the responses on board
This is the latest code I am using. It seems to be doing the job as expected, apart from the output, which isn't managed as well as I'd like. I'd like to add the items into a dictionary: the key being the file name, and two values, the username and password. Although this will be added as a separate question.
Thanks all for the help
# NOTE(review): os.walk does not expand '~' — wrap dirt in
# os.path.expanduser() or the walk will find nothing.
dirt = "~/Desktop/tmp"
def get_files():
# One alternation per credential pattern: quoted and unquoted values
# for both user- and pass-style assignments.
regs = ["(.*)((U|u)ser(.*))(\s=\s\W\w+\W)", "(.*)((U|u)ser(.*))(\s=\s\w+)", "(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)", "(.*)((P|p)ass(.*))(\s=\s\W\w+\W)"]
combined = "(" + ")|(".join(regs) + ")"
# NOTE(review): `results` is created but never filled or returned.
results = dict()
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
# NOTE(review): readfile is never closed — use a `with` block.
readfile = open(os.path.join(root, filename), "r")
for line in readfile:
m = re.match(combined, line)
if m:
# Python 2 print statements.
print os.path.join(root, filename)
print m.group(0)
Latest Code output
~/Desktop/tmp/Domain.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs
strUser = "guytom"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts /Logon/logonscript1.vbs
strPassword = "P#ssw0rd1"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strUsername = "guytom2"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strPass = "SECRETPASSWORD"
https://docs.python.org/2/library/re.html
group([group1, ...])
Returns one or more subgroups of the match. If there is a single argument, the result is a single string; if there are multiple arguments, the result is a tuple with one item per argument. Without arguments, group1 defaults to zero (the whole match is returned). If a groupN argument is zero, the corresponding return value is the entire matching string;
match.group(0)
Since you can have many object=value you need to use regular expressions. Here is some sample code for you.
# Sample inputs demonstrating the user/password extraction patterns.
line1 = " someuser = bob "
line2 = " bob'spasswd= secretpassword"
#re.I will do case insensitive search
# Group 1 captures the value after '=' (non-whitespace for the user,
# rest of line for the password).
userMatchObj=re.search('.*user.*=\\s*([\\S]*).*', line1, re.I)
pwdMatchObj=re.search(r'.*pass.*=\s*(.*)', line2, re.I)
# Python 2 print statements.
if userMatchObj: print "user="+userMatchObj.group(1)
if pwdMatchObj: print "password="+pwdMatchObj.group(1)
output:
user=bob
password=secretpassword
References: https://docs.python.org/2/library/re.html , http://www.tutorialspoint.com/python/python_reg_expressions.htm
Thanks all for the help. Below is my working code (needs further work on the output but the matching is working well)
# NOTE(review): os.walk does not expand '~' — wrap dirt in
# os.path.expanduser() before walking.
dirt = "~/Desktop/tmp"
def get_files():
# One alternation per credential pattern: quoted and unquoted values
# for both user- and pass-style assignments.
regs = ["(.*)((U|u)ser(.*))(\s=\s\W\w+\W)", "(.*)((U|u)ser(.*))(\s=\s\w+)", "(.*)((P|p)ass(.*))\s=\s(\W(.*)\W)", "(.*)((P|p)ass(.*))(\s=\s\W\w+\W)"]
combined = "(" + ")|(".join(regs) + ")"
# NOTE(review): `results` is created but never filled or returned.
results = dict()
for root, dirs, files in os.walk(dirt):
for filename in files:
if filename.endswith(('.bat', '.vbs', '.ps', '.txt')):
# NOTE(review): readfile is never closed — use a `with` block.
readfile = open(os.path.join(root, filename), "r")
for line in readfile:
m = re.match(combined, line)
if m:
# Python 2 print statements.
print os.path.join(root, filename)
print m.group(0)
Latest Code output
~/Desktop/tmp/Domain.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs
strUser = "guytom"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript1.vbs
strPassword = "P#ssw0rd1"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strUsername = "guytom2"
~/Desktop/tmp/DLsec.local/Policies/{31B2F340-016D-11D2-945F-00C04FB984F9}/USER/Scripts/Logon/logonscript2.bat
strPass = "SECRETPASSWORD"

Python: Regular Expression only works on directories with 1 file in

I have the following function which uses a RE:
def friendSearch():
# Scan every file in the folder for a JSON-style "NAME":"value" pair.
os.chdir("C:/Users/David/myFiles")
files = os.listdir(".")
for x in files:
inputFile = open(x, "r")
content = inputFile.read()
inputFile.close()
# re.search returns only the FIRST match in this file, and `match`
# is rebound on every iteration — so only the last file's (single)
# result survives the loop; earlier files' matches are lost.
match = re.search(r'(?<="NAME":)("[^"]+")',content)
print (match)
It works fine when the file containing the string is in a directory on its own, but when other files are added to the directory it returns nothing.
Is this because "match" is overwritten with each file that is processed? If so, how can I stop this?
Thanks in advance
You are correct that the issue is match being written over with each file. I am assuming you want a single list with all of the matches from each file, so instead of doing match = ... use matches.extend(...) and initialize matches to an empty list before your loop.
For example:
def friendSearch(path="C:/Users/Luke/Desktop/Files"):
    """Collect every "text":"..." value found in the files under *path*
    (the default keeps the original hard-coded folder), print the list,
    and return it.

    Fixes: the file handle is now closed even when a binary file raises
    UnicodeDecodeError (the original `continue` skipped the close() call,
    leaking the handle); the folder is a parameter instead of being
    hard-coded; and the matches are returned so callers can use them
    programmatically.  Note: this still chdir()s into *path*, like the
    original.
    """
    matches = []
    os.chdir(path)
    for name in os.listdir("."):
        try:
            with open(name, "r") as handle:
                content = handle.read()
        except UnicodeDecodeError:
            # binary/undecodable file — skip it; `with` already closed it
            continue
        matches.extend(re.findall(r'(?<="text":)("[^"]+")', content))
    print(matches)
    return matches
Your match will contain search results from the very last file only. This line:
match = re.findall(r'(?<="text":)("[^"]+")',content)
discards what was in match before it. Try this:
# Accumulate matches across ALL files instead of overwriting per file.
# NOTE(review): `files` must already be defined (e.g. os.listdir(".")).
match = []
for x in files:
inputFile = open (x, "r")
try:
content = inputFile.read()
except UnicodeDecodeError:
# NOTE(review): this `continue` skips inputFile.close() below,
# leaking the handle — a `with` block would avoid that.
continue
inputFile.close ()
match = match + re.findall (r'(?<="text":)("[^"]+")', content)

Categories

Resources