Python: Multiple Text Files to Dataframe

I'm a little stuck on how exactly to proceed, so a little nudge would be very helpful.
I have ~1800 text files, emails actually, that are in a repeated format.
The structure of each file is as follows:
From: Person-1 [email#person-1.com]
Sent: Tuesday, April 18, 2017 11:24 AM
To: email#person-2.com
Subject: Important Subject
User,
Below is your search alert.
Target: text
Attribute: text
Label: abcdef
Time: Apr 18, 2017 11:24 EDT
Full Text: Text of various length exists here. Some files even have links. I'm not sure how I would capture a varied length field.
Recording: abcde & fghijk lmnop
That's the gist of it.
I would like to write that into a DF I can store as a CSV.
I would like to end up with maybe something like this?
| Target | Attribute | Label   | Time   | Full Text     | Recording   | Filename |
|--------|-----------|---------|--------|---------------|-------------|----------|
| text   | text      | abcdef  | (date) | (Full text..) | abcde & f.. | 1111.txt |
| text2  | text2     | abcdef2 | (date) | (Full text..) | abcde & f.. | 1112.txt |
Where the 2nd row is another text file.
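For the varied-length Full Text field mentioned above, one option I can imagine is a non-greedy, multi-line regex; a sketch, assuming Full Text always ends where the Recording: line starts:

import re

sample = """Full Text: Text of various length exists here.
It can span lines and contain links.
Recording: abcde & fghijk lmnop"""

# re.DOTALL lets . match newlines; the lookahead stops at "Recording:"
match = re.search(r"Full Text:(.*?)(?=Recording:)", sample, re.DOTALL)
if match:
    print(match.group(1).strip())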
I have code to go through all of the text files and print them. Here's that code:
# -*- coding: utf-8 -*-
import os
import sys

# Take all text files in workingDirectory and put them into a DF.
def convertText(workingDirectory, outputDirectory):
    if workingDirectory == "": workingDirectory = os.getcwd() + "\\" # Use the current working directory if workingDirectory is empty
    i = 0
    for txt in os.listdir(workingDirectory): # Iterate through the text files in workingDirectory
        print("Processing File: " + str(txt))
        fileExtension = txt.split(".")[-1]
        if fileExtension == "txt":
            textFilename = workingDirectory + txt # Becomes: \PATH\example.txt
            f = open(textFilename, "r")
            data = f.read() # read what is inside
            print(data) # print to show it is readable
            #RegEx goes here?
            i += 1 # counter
    print("Successfully read " + str(i) + " files.")

def main(argv):
    workingDirectory = "../Documents/folder//" # Put your source directory of text files here
    outputDirectory = "../Documents//" # Where you want your converted files to go.
    convertText(workingDirectory, outputDirectory)

if __name__ == "__main__":
    main(sys.argv[1:])
I guess I would need RegEx, maybe, to parse the files? What would you recommend?
I am not opposed to using R or something else, if it makes more sense.
Thank You.

Regex should be sufficient for your use case. Using the expression r"\sTarget:(.*)" you can match everything on the line that follows Target:; then, by creating a list of all the fields you wish to match and iterating over them, you can build up a dictionary that stores the value of each field.
Using the Python csv library you can create a CSV file and, for each .txt file in your directory, push a row of the matched dictionary fields with writer.writerow({'Target':'','Attribute':'','Time':'','Filename':'','Label':''}).
Example:
import os
import sys
import re
import csv

# Take all text files in workingDirectory and put them into a DF.
def convertText(workingDirectory, outputDirectory):
    with open(outputDirectory + 'emails.csv', 'w') as csvfile: # opens the file \PATH\emails.csv
        fields = ['Target', 'Attribute', 'Label', 'Time', 'Full Text'] # fields you're searching for with regex
        csvfield = ['Target', 'Attribute', 'Label', 'Time', 'Full Text', 'Filename'] # the CSV header also includes the file name, which is not found with regex
        writer = csv.DictWriter(csvfile, delimiter=',', lineterminator='\n', fieldnames=csvfield)
        writer.writeheader() # writes the csvfield list to the header of the csv
        if workingDirectory == "": workingDirectory = os.getcwd() + "\\" # Use the current working directory if workingDirectory is empty
        i = 0
        for txt in os.listdir(workingDirectory): # Iterate through the text files in workingDirectory
            print("Processing File: " + str(txt))
            fileExtension = txt.split(".")[-1]
            if fileExtension == "txt":
                textFilename = workingDirectory + txt # Becomes: \PATH\example.txt
                f = open(textFilename, "r")
                data = f.read() # read what is inside
                #print(data) # print to show it is readable
                fieldmatches = {}
                for field in fields:
                    regex = "\\s" + field + ":(.*)" # builds e.g. r"\sTarget:(.*)", which captures everything on the line after "Target:"
                    match = re.search(regex, data)
                    if match:
                        fieldmatches[field] = match.group(1)
                fieldmatches['Filename'] = txt # record which file this row came from
                writer.writerow(fieldmatches) # for each file, write the dict of matched fields as a row of the csv
                i += 1 # counter
        print("Successfully read " + str(i) + " files.")

def main(argv):
    workingDirectory = "../Documents/folder//" # Put your source directory of text files here
    outputDirectory = "../Documents//" # Where you want your converted files to go.
    convertText(workingDirectory, outputDirectory)

if __name__ == "__main__":
    main(sys.argv[1:])
For processing ~1800 files this should be fast enough; on my machine it took less than a second:
Successfully read 1866 files.
Time: 0.6991933065852838
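If you'd rather end up with an actual DataFrame, as the question asks, the same loop can feed pandas instead of the csv module. A minimal sketch, assuming pandas is installed; the fields and regex are the same as above:

import os
import re
import pandas as pd

fields = ['Target', 'Attribute', 'Label', 'Time', 'Full Text', 'Recording']
rows = []
workingDirectory = os.getcwd()
for txt in os.listdir(workingDirectory):
    if txt.endswith(".txt"):
        with open(os.path.join(workingDirectory, txt), "r") as f:
            data = f.read()
        row = {'Filename': txt}
        for field in fields:
            match = re.search("\\s" + field + ":(.*)", data)
            if match:
                row[field] = match.group(1).strip()
        rows.append(row)

df = pd.DataFrame(rows) # one row per email, columns in insertion order
df.to_csv("emails.csv", index=False) # store the DataFrame as a CSV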
Hope this helps!

Related

Move files into different folders based on a text list Python

I'm new to Python and I've been trying to simplify a manual task that I do on a daily basis. I have a text file with a list of file names separated into groups by a blank line, like this:
fileName1
fileName2
fileName3
fileName4
fileName5
fileName6
fileName7
fileName8
fileName9
fileName10
fileName11
fileName12
All of these files are in one folder, and I want to find each group of files and move them into separate folders; the name of each new folder should be the name of the first file in the group.
I've done my research and found how to do each step separately using the os and shutil modules, but I can't find a way to join them together into a clean script. Any help I can get from you guys will be awesome, thanks!!
Here's a little script that can do that.
I've made two assumptions:
The file with the list of files is stored in the same directory as source files
There is a blank line after the last file so the script can grab the last group
import os
from shutil import move
from itertools import groupby

# Where the files are originally stored
src_path = "C:\\temp\\src\\"

# Where the group folders will go
dest_path = "C:\\temp\\dest\\"

# Open up the file containing the list of files
with open(src_path + "list_of_files.txt") as txt:
    lines = txt.readlines() # Read the file

# Split the contents of the file based on the newline character "\n"
i = (list(g) for _, g in groupby(lines, key='\n'.__ne__))
list_of_groups = [a + b for a, b in zip(i, i)]

# Iterate through each group
for group in list_of_groups:
    folder_name = dest_path + group[0].replace("\n", "") + "\\"
    if not os.path.exists(folder_name):
        # Create a folder for each group if it doesn't already exist
        os.mkdir(folder_name)
    # Move over each file in the group. The last element in the group is a newline character
    for file in group:
        if file != "\n":
            move(src_path + file.replace("\n", ""), folder_name + file.replace("\n", ""))
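The zip(i, i) line is the subtle part: because i is a single generator, zipping it with itself consumes the groups in pairs, matching each run of file names with the run of blank lines that follows it. A standalone sketch of that pairing idiom:

from itertools import groupby

lines = ["a\n", "b\n", "\n", "c\n", "d\n", "\n"]

# Consecutive runs of non-blank / blank lines
i = (list(g) for _, g in groupby(lines, key='\n'.__ne__))

# Zipping a generator with itself consumes it two groups at a time:
# (names_run, blanks_run), (names_run, blanks_run), ...
print([a + b for a, b in zip(i, i)])
# [['a\n', 'b\n', '\n'], ['c\n', 'd\n', '\n']]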
When reading a file you can check each line for specific characters; a blank line contains only the newline character, \n.
import os
import shutil

filepath1 = os.getcwd() # current working directory, where the source files live
filepath2 = "path\\to\\where\\you\\want\\dir"
filename = os.path.join(filepath1, "yourfilename.txt")
dirName = []
groupName = "filegroup"
idx = 1
newPath = ""
init = True
file = open(filename, "r")
for line in file:
    if init == True: # create the initial folder
        newPath = os.path.join(filepath2, groupName + str(idx))
        os.mkdir(newPath)
        dirName.append(groupName + str(idx))
        init = False
    if line == "\n": # line in file is empty, so start a new group folder
        idx += 1
        newName = groupName + str(idx)
        dirName.append(newName)
        newPath = os.path.join(filepath2, newName)
        os.mkdir(newPath)
    else: # move the listed file into the current group folder
        shutil.move(os.path.join(filepath1, line.rstrip()), os.path.join(newPath, line.rstrip()))
file.close()

Run a python script on all files in a directory

First time posting a question here; hopefully someone who has experienced/tried this can share their insights... I've been working to get this far over the last few days and nights, and now I'm getting nowhere trying to loop this script over every file in a directory.
Basically, these two scripts work perfectly fine: they take a pdf file and turn it into an Excel workbook. Now what I need to do is go through every file in a selected directory and do the same job.
I keep getting stuck at the file-opening stage. Is this saying that the data (the pdf page, data[0]) can't be read in? Or should I add more steps to bring the dataset in?
Do I have to create a list for the dataset so I can call in the data, since there would be more than one dataset to call in? Is that why Python can't read data[0]?
Revised Script
# import
import os
import glob
import pdftotext
import openpyxl
from pathlib import Path
from string import ascii_uppercase

# open a pdf file
def to_excel(pdf_file):
    with open(pdf_file, 'rb') as f:
        data = pdftotext.PDF(f)
        # operate on the data to get titles and values
        datas = data[0].split('\r\n')
        finalData = list()
        for item in datas:
            if item != '':
                finalData.append(item)
        finalDataRefined = list()
        for item in finalData:
            if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
                finalDataRefined.append(item.strip())
        titles = list()
        values = list()
        for num, item in enumerate(finalDataRefined):
            if num % 2 == 0:
                titles.append(item)
            else:
                values.append(item)
        # get an output file name
        OPRAST = values[1]
        filename = work_dir / f"{OPRAST}.xlsx"
        # create an excel workbook
        excel_file = openpyxl.Workbook()
        excel_sheet = excel_file.active
        excel_sheet.append([])
        alphaList = list(ascii_uppercase)
        for alphabet in alphaList:
            excel_sheet.column_dimensions[alphabet].width = 20
        excel_sheet.append(titles)
        excel_sheet.append(values)
        # save the excel workbook
        excel_file.save(filename)
        excel_file.close()

# run the script on every file in a directory
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
    to_excel(pdf_file)
I basically know what you want to do, but your code's indentation is hard to read, especially since this is Python.
Is your goal to create an Excel file for each pdf file in your source directory, or to aggregate all the pdf files together into a single Excel file?
The following code is for the first goal.
Code logic:
get all the pdf files
loop over all the pdf files; for each:
open the pdf file
do some operations on the data
export to an Excel file
Your full code may look like this (just a guess):
# ----------------import part-------------------
import os
import glob
import pdftotext
import openpyxl
from string import ascii_uppercase
from pathlib import Path

def to_excel(pdf_file):
    with open(pdf_file, 'rb') as f: # this opens the pdf file
        data = pdftotext.PDF(f)
        # ---------------operate on the data, get titles and values-----------
        datas = data[0].split('\r\n')
        finalData = list()
        for item in datas:
            if item != '':
                finalData.append(item)
        finalDataRefined = list()
        for item in finalData:
            if item != ' BCA Scheduled Maintenance Questions' and item != ' Do you suspect there is Asbestos at the property?' and item != ' Yes' and item != ' No' and item != '\x0c':
                finalDataRefined.append(item.strip())
        titles = list()
        values = list()
        for num, item in enumerate(finalDataRefined):
            if num % 2 == 0:
                titles.append(item)
            else:
                values.append(item)
        # ------------------get output file name---------------------
        OPRAST = values[1]
        filename = work_dir / f"{OPRAST}.xlsx"
        # ------------------create excel file sheet------------------
        excel_file = openpyxl.Workbook()
        excel_sheet = excel_file.active
        excel_sheet.append([])
        alphaList = list(ascii_uppercase)
        for alphabet in alphaList:
            excel_sheet.column_dimensions[alphabet].width = 20
        excel_sheet.append(titles)
        excel_sheet.append(values)
        # --------------------save----------------
        excel_file.save(filename)
        excel_file.close()

# -------------------main program---------------
alphaList = list(ascii_uppercase)
work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
for pdf_file in work_dir.glob("*.pdf"):
    to_excel(pdf_file)
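For the second goal (one aggregated workbook rather than one per pdf), a minimal sketch along the same lines; extract_titles_values is a hypothetical helper standing in for the extraction above, minus the question-specific filtering:

import pdftotext
import openpyxl
from pathlib import Path

def extract_titles_values(pdf_file):
    # hypothetical helper: same extraction idea as to_excel above,
    # without the question-specific filtering of title lines
    with open(pdf_file, 'rb') as f:
        data = pdftotext.PDF(f)
        items = [item.strip() for item in data[0].split('\r\n') if item != '']
    return items[0::2], items[1::2] # even positions are titles, odd are values

work_dir = Path(r"C:\Users\Sunny Kim\Downloads\Do Forms")
excel_file = openpyxl.Workbook()
excel_sheet = excel_file.active
wrote_header = False
for pdf_file in work_dir.glob("*.pdf"):
    titles, values = extract_titles_values(pdf_file)
    if not wrote_header: # single header row for the whole workbook
        excel_sheet.append(titles)
        wrote_header = True
    excel_sheet.append(values) # one row per pdf
excel_file.save(str(work_dir / "aggregated.xlsx"))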

How to unzip all folders/files that end in .zip and extract “file.txt” file from each zipped folder

My code currently unzips one zip folder, finds the file called file.txt, and extracts it. Now I need to unzip multiple folders that have the extension .zip. I have tried to use code similar to what I need it to do, but the problem is that now I have to find a file called file.txt in each of those .zip folders and extract that file only, and also store each file.txt in a separate folder that has the same name as the zip it came from. Thank you in advance for your time.
import re
import os
from zipfile import ZipFile

def pain():
    print("\t\t\tinput_files.zip has been unzipped")
    with ZipFile('input_files.zip', 'r') as zipObj:
        zipObj.extractall()
        listOfFileNames = zipObj.namelist()
        for fileName in listOfFileNames:
            if fileName.endswith('.txt'):
                zipObj.extract(fileName, 'storage')
    outfile = "output2.txt" # this will be the filename that the code will write to
    baconFile = open(outfile, "wt")
    file_name1 = "file.txt"
    print('Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n') # prints the master column in the python shell; this is the way the code should collect the data
    baconFile.write('Filename\tLine\tnumber of numbers\tstring separated by a comma\twhite space found\ttab found\tcarriage return found\n') # prints the master column in the output file
    #for filename in os.listdir(os.getcwd() + "/input_files"):
    for filename in os.listdir(r'C:\Users\M29858\Desktop\TestPy\Version10\input_files'): # raw string so the backslashes aren't treated as escapes
        with open("input_files/" + filename, 'r') as f:
            if file_name1 in filename:
                output_contents(filename, f, baconFile)
    baconFile.close() # closes the output file the code is writing to

def output_contents(filename, f, baconFile): # using open() to read the file inside the directory
    index = 0
    for line in f:
        # create a list of all of the comma-separated values in the line
        content = line.split(',') # used to count the number of values before and after each comma
        whitespace_found = False
        tab_found = False
        false_string = "False (end of file)"
        carriage_found = false_string
        sigfigs = ""
        index += 1 # adds 1 for every line processed
        if " " in line: # checking for whitespace
            whitespace_found = True
        if "\t" in line: # checking for tabs
            tab_found = True
        if '\n' in line: # checking if there is a newline at the end of the line
            carriage_found = True
        sigfigs = (','.join(str(len(g)) for g in re.findall(r'\d+\.?(\d+)?', line))) # counts the digits after the decimal point
        print(filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
              .format(index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found))) # whatever is inside .format() is the way the data is stored
        baconFile.write('\n')
        baconFile.write(filename + "\t{0:<4}\t{1:<17}\t{2:<27}\t{3:17}\t{4:9}\t{5:21}"
                        .format(index, len(content), sigfigs, str(whitespace_found), str(tab_found), str(carriage_found)))

if __name__ == '__main__':
    pain()
#THIS WORKS
import glob
import os
from zipfile import ZipFile

def main():
    for fname in glob.glob("*.zip"): # get all the zip files
        with ZipFile(fname) as archive:
            # if there's no file.txt, ignore and go on to the next zip file
            if 'file.txt' not in archive.namelist(): continue
            # make a new directory named after the zip file
            dirname = fname.rsplit('.', 1)[0]
            os.mkdir(dirname)
            # extract file.txt into the directory you just created
            archive.extract('file.txt', path=dirname)
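One caveat: glob.glob("*.zip") only looks in the current working directory, and main() still has to be called. A hedged variant that takes the directory as a parameter (pathlib and os.makedirs are standard library):

import os
from pathlib import Path
from zipfile import ZipFile

def extract_all(zip_dir):
    # scan an arbitrary directory instead of the current one
    for zip_path in Path(zip_dir).glob("*.zip"):
        with ZipFile(zip_path) as archive:
            if 'file.txt' not in archive.namelist():
                continue
            dirname = zip_path.with_suffix('') # folder named after the zip file
            os.makedirs(dirname, exist_ok=True) # don't fail if it already exists
            archive.extract('file.txt', path=dirname)

if __name__ == '__main__':
    extract_all('.') # current directory; pass any folder of zip files here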

Check if a file exists using a text file

I have a folder (Molecules) with many sdf files (M00001.sdf, M00002.sdf and so on) representing different molecules. I also have a csv where each row represents a molecule (M00001, M00002, etc.).
I'm writing code to get the files in the Molecules folder whose names appear as rows in the csv file.
First attempt
import os

path_to_files = '/path_to_folder/Molecules' # path to Molecules folder
for files in os.listdir(path_to_files):
    names = os.path.splitext(files)[0] # get the basename (molecule name)
    with open('molecules.csv') as ligs: # Open the csv file of molecule names
        for hits in ligs:
            if names == hits:
                print names, hits
            else:
                print 'File is not here'
However this returns nothing on the command line (literally nothing). What is wrong with this code?
I am not sure that this is the best way (I only know that the following code works for my data), but if your molecules.csv has the standard csv format, i.e. "molecule1,molecule2,molecule3 ...", you can try rearranging your code this way:
import os
import csv

path_to_files = '/path_to_folder/Molecules' # path to Molecules folder
for files in os.listdir(path_to_files):
    names = os.path.basename(files)
    names = names.replace(".sdf", "")
    with open('molecules.csv', 'r') as ligs:
        content = csv.reader(ligs)
        for elem in content:
            for hits in elem:
                if names == hits:
                    print names, hits
                else:
                    print 'File is not here'
See csv File Reading and Writing for the csv module.
I solved the problem with a rather brute-force approach:
import os
import csv
import shutil

path_to_files = None # path to Molecules folder
new_path = None # new folder to save files
os.mkdir(new_path) # create the folder to store the molecules
hits = open('molecules.csv', 'r')
ligands = []
for line in hits:
    lig = line.rstrip('\n')
    ligands.append(lig)
for files in os.listdir(path_to_files):
    molecule_name = os.path.splitext(files)[0]
    full_name = '/' + molecule_name + '.sdf'
    old_file = path_to_files + full_name
    new_file = new_path + full_name
    if molecule_name in ligands:
        shutil.copy(old_file, new_file)
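A small design note on the above: ligands is a list, so molecule_name in ligands is a linear scan for every file. For large folders a set gives constant-time membership; a minimal sketch of that variant, assuming one molecule name per line of the csv:

import os
import shutil

path_to_files = '/path_to_folder/Molecules'
new_path = '/path_to_folder/Matched'
os.mkdir(new_path)

# load all molecule names once into a set for O(1) lookups
with open('molecules.csv', 'r') as hits:
    ligands = {line.rstrip('\n') for line in hits}

for fname in os.listdir(path_to_files):
    molecule_name = os.path.splitext(fname)[0]
    if molecule_name in ligands:
        shutil.copy(os.path.join(path_to_files, fname),
                    os.path.join(new_path, fname))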

Python - stuck at reading a specific row from a csv

I need to add columns to a "matched" shapefile based on a csv. I have one last step to complete, which is to get the value to enter into the shp from the csv.
When I try readCSV[rowID] I get:
Traceback (most recent call last):
  File "<interactive input>", line 1, in <module>
TypeError: '_csv.reader' object is not subscriptable
The stripped-down CSV and the matched file names were shown as screenshots in the original post.
The code matches OVL_CAT + OVL2_DESC to the file name.
I then get the code to add a column called LGA_CODE and need to populate it with '583094', which is row 2, column 1 of the csv. How do I get this value when I can't index the csv reader to get row 2 (row 3 in the example, but 2 in Python)?
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice

top = os.getcwd() # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch' # change if you want output somewhere else
top = RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011' # source of your data (subdirectories searched as well)
outDIR = top + "\\workingFiles" # directory where output is written to. Includes temp files
finalDIR = top + "\\final" # folder for final data only
AOI = 'AOI.csv' # name of the file containing the las file names in the second column
Compare_Column = 2
Compare_Column2 = 3
# END setting base paths
# NOTHING BELOW should need editing.
FileTypes = ['shp']
SearchStrings = []
filecount = 0
List = []
count = 0
x = 0
os.chdir(top)

# Generate list with unique file name codes from CSV
FileList = csv.reader(open(AOI))
SearchStrings = []
rowID = 0
for File in FileList:
    #SearchStrings.append(File[0]+","+File[1])
    SearchStrings.append(str(rowID) + '_' + File[Compare_Column] + '_' + File[Compare_Column2])
    rowID = rowID + 1

for root, dirs, files in os.walk(SourceDIR, topdown=False):
    for fl in files:
        currentFile = os.path.join(root, fl)
        for FileType in FileTypes:
            status = str.endswith(currentFile, FileType)
            if str(status) == 'True':
                List.append(currentFile)
                for SearchString in SearchStrings:
                    #print currentFile
                    #print SearchString
                    if str(SearchString in currentFile) == 'True':
                        #print str(currentFile)+str(status)
                        List.append(currentFile)
                        filecount = filecount + 1
#del fl

# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
    headers = list(islice(fin, headers_count))
delimiter = ','
header = str(headers)
header_list = header.split(delimiter)

# Process matching files
for fl in List:
    header_count = 0
    for header in header_list:
        dfStore = fl
        #arcpy.AddField_management(dfStore, str(header), 'TEXT')
        # Get RowID to read column data from
        filename = fl[fl.rfind('\\') + 1:fl.rfind('_')]
        for field in SearchStrings:
            #print field, filename
            if field.endswith(filename):
                rowID = field[:field.find('_')]
                with open(AOI, 'rb') as f:
                    readCSV = csv.reader(f)
                    text = readCSV[rowID][1]
##                    arcpy.CalculateField_management(fl, header, text, "PYTHON_9.3")
=== UPDATED CODE BASED ON COMMENTS: it's all working fine if anyone needs it.
import os, sys, datetime, csv, arcpy, string
from subprocess import Popen
from itertools import islice

top = os.getcwd() # change to a specific path if required.
# This otherwise starts with the directory the script is in (preferred).
RootOutput = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch' # change if you want output somewhere else
top = RootOutput
SourceDIR = r'P:\2012\273_CCRC_Townplanning_Datasets\Working\scratch\OM_011' # source of your data (subdirectories searched as well)
outDIR = top + "\\workingFiles" # directory where output is written to. Includes temp files
finalDIR = top + "\\final" # folder for final data only
AOI = 'AOI.csv' # name of the file containing the las file names in the second column
Compare_Column = 3
Compare_Column2 = 4
# END setting base paths
# NOTHING BELOW should need editing.
FileTypes = ['shp']
SearchStrings = []
filecount = 0
List = []
count = 0
x = 0
os.chdir(top)

# Generate list with unique file name codes from CSV
FileList = csv.reader(open(AOI))
SearchStrings = []
rows = []
#FinalList=[]
rowID = 0
for File in FileList:
    #SearchStrings.append(File[0]+","+File[1])
    SearchStrings.append(str(rowID) + '_' + File[Compare_Column] + '_' + File[Compare_Column2])
    rows.append(File)
    #FinalList.append()
    rowID += 1

for root, dirs, files in os.walk(SourceDIR, topdown=False):
    for fl in files:
        currentFile = os.path.join(root, fl)
        for FileType in FileTypes:
            status = str.endswith(currentFile, FileType)
            if status:
                #List.append(currentFile)
                for SearchString in SearchStrings:
                    #print currentFile, SearchString
                    if str(SearchString[SearchString.find('_') + 1:] in currentFile) == 'True':
                        #print str(currentFile)+str(status)
                        List.append(currentFile)
                        filecount = filecount + 1
#del fl

# Get list of Column Names
headers_count = 1
with open(AOI) as fin:
    headers = list(islice(fin, headers_count))
delimiter = ','
header = str(headers)
header_listT = header.split(delimiter)
header_list = []
for hdr in header_listT:
    header_list.append(arcpy.ValidateTableName(hdr)[:10])

# Process matching files
columnID = 1
for fl in List:
    header_count = 0
    for header in header_list:
        print header
        dfStore = fl
        try:
            arcpy.AddField_management(dfStore, str(header), 'TEXT')
        except:
            pass
        # Get RowID to read column data from
        filename = fl[fl.rfind('\\') + 1:fl.rfind('_')]
        for field in SearchStrings:
            #print field, filename
            #print header, field
            if field.endswith(filename):
                #print 'FOUND......................'
                column_count = len(fl)
                if columnID < len(header_list):
                    rowID = int(field[:field.find('_')])
                    text = rows[rowID][columnID]
                    print filename, header, text
                    columnID += 1
                    arcpy.CalculateField_management(fl, header, "text", "PYTHON_9.3")
#arcpy.CalculateField_management("P:/2012/273_CCRC_Townplanning_Datasets/Working/scratch/OM_011/OM_011_Waterway_Envelopes_ccrc.shp","LGA_CODE","5","PYTHON","#")
Your problem is in these two lines:
readCSV= csv.reader(f)
text=readCSV[rowID][1]
csv.reader is an iterator over the lines of the file; it cannot be indexed directly. You could use islice to get the element you want (islice(readCSV, rowID, rowID+1).next()), though a neater solution is to store a list of the AOI rows, indexed by rowID, when you read the file the first time (in the SearchStrings loop):
FileList = csv.reader(open(AOI))
SearchStrings = []
rows = []
rowID = 0
for File in FileList:
    #SearchStrings.append(File[0]+","+File[1])
    SearchStrings.append(str(rowID) + '_' + File[Compare_Column] + '_' + File[Compare_Column2])
    rows.append(File)
    rowID = rowID + 1

... # later
rowID = int(field[:field.find('_')])
text = rows[rowID][1]
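To make the difference concrete, a small standalone sketch of the islice alternative (Python 2 era, matching the .next() call above):

import csv
from itertools import islice

with open('AOI.csv', 'rb') as f: # 'rb' as in the original Python 2 code
    readCSV = csv.reader(f)
    # readCSV[2] would raise TypeError: '_csv.reader' object is not subscriptable
    row = islice(readCSV, 2, 3).next() # advances the reader and returns row 2 (0-based)
    print row[1]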
