Copy specific lines from multiple text files to an excel file - python

I have as many as 1500 text files and I want to copy 5 lines from every text file, say line 4,5,9,14 and 32. I want to make columns of these files in an excel sheet one below the other, of the 1500 text files. I have figured out a code that takes in only one txt file but copies all the data into rows. Any help will be appreciated.
Here is my code:
import csv
import xlwt
import os
import sys
# Asker's single-file script: reads one tab-delimited text file with
# csv.reader and writes every field into a same-named .xls workbook via xlwt.
# Look for input file in same location as script file:
# NOTE(review): the second argument is itself an absolute Windows path, so
# os.path.join presumably drops the script directory -- confirm on the target OS.
inputfilename = os.path.join(os.path.dirname(sys.argv[0]),
'C:/path/filename.txt')
# Strip off the path
basefilename = os.path.basename(inputfilename)
# Strip off the extension
basefilename_noext = os.path.splitext(basefilename)[0]
# Get the path of the input file as the target output path
targetoutputpath = os.path.dirname(inputfilename)
# Generate the output filename
outputfilename = os.path.join(targetoutputpath, basefilename_noext + '.xls')
# Create a workbook object
workbook = xlwt.Workbook()
# Add a sheet object
worksheet = workbook.add_sheet(basefilename_noext, cell_overwrite_ok=True)
# NOTE(review): the bare word two lines below looks like the wrapped tail of
# the following comment rather than intended code.
# Get a CSV reader object set up for reading the input file with tab
delimiters
# NOTE(review): 'rb' is the Python 2 convention for csv input; the file object
# opened here is never closed explicitly in the visible code.
datareader = csv.reader(open(inputfilename, 'rb'),
delimiter='\t', quotechar='"')
# Process the file and output to Excel sheet
# Each input row becomes a sheet row and each field a column.
for rowno, row in enumerate(datareader):
for colno, colitem in enumerate(row):
worksheet.write(rowno, colno, colitem)
# Write the output file.
workbook.save(outputfilename)
# Open it via the operating system (will only work on Windows)
# On Linux/Unix you would use subprocess.Popen(['xdg-open', filename])
os.startfile(outputfilename)

You would first need to put all of your required text files in the current folder, glob.glob('*.txt') could then be used to get a list of these filenames. For each text file, read the files in using readlines() and extract the required lines using itemgetter(). For each file, create a new row in your output worksheet and write each line as a different column entry.
import xlwt
import glob
import operator
# 1-based numbers of the lines to copy from every text file.
LINE_NUMBERS = (4, 5, 9, 14, 32)


def extract_lines(all_lines, line_numbers=LINE_NUMBERS):
    """Return the requested lines with surrounding whitespace stripped.

    all_lines: list of lines, e.g. the result of ``readlines()``.
    line_numbers: 1-based line numbers to pick (at least one).

    Raises ValueError for a line number below 1 and IndexError when
    ``all_lines`` has fewer lines than the largest requested number.
    """
    if any(number < 1 for number in line_numbers):
        raise ValueError("line numbers are 1-based")
    # itemgetter() indexes from 0, so shift the 1-based line numbers down.
    indices = [number - 1 for number in line_numbers]
    picked = operator.itemgetter(*indices)(all_lines)
    if len(indices) == 1:
        # With a single index itemgetter returns the item itself, not a tuple.
        picked = (picked,)
    return [line.strip() for line in picked]


def export_lines_to_xls(pattern='*.txt', output='output.xls'):
    """Write the selected lines of each matching file to one sheet row.

    pattern: glob pattern selecting the text files to read.
    output: path of the .xls workbook to create.

    A file that is too short is reported and its row is left empty, so row
    numbers still correspond to the (sorted) file order.
    """
    # Create a workbook object
    wb = xlwt.Workbook()
    # Add a sheet object
    ws = wb.add_sheet('Sheet1', cell_overwrite_ok=True)

    # glob order is arbitrary; sort so the rows come out in a stable order.
    for rowy, text_filename in enumerate(sorted(glob.glob(pattern))):
        with open(text_filename) as f_input:
            all_lines = f_input.readlines()

        try:
            lines = extract_lines(all_lines)
        except IndexError:
            print("'{}' is too short".format(text_filename))
            lines = []

        # Output to Excel sheet: one column per extracted line
        for colno, colitem in enumerate(lines):
            ws.write(rowy, colno, colitem)

    # Write the output file.
    wb.save(output)


if __name__ == '__main__':
    export_lines_to_xls()

Related

How to copy multiple .xlsx files into a respective .csv file?

I have 24 Excel files, and I'm aiming to copy the .xlsx data to their respective 24 .csv files. I have copied the data over; however, it's creating 10 copies in the .csv files, and I believe it has something to do with the for loops. I've tried to use writerow() rather than writerows(), yet that doesn't help. I'm trying to understand openpyxl and its writer and reader objects.
import openpyxl, os, csv
from pathlib import Path
# Asker's attempt: for each .xlsx file in ./excelspreadsheets, write one CSV
# per worksheet, named <workbook stem>_<sheet title>.csv.
for excelFile in os.listdir('./excelspreadsheets'):
if excelFile.endswith('.xlsx'): # Skip non xlsx files, load the workbook object
wb = openpyxl.load_workbook('./excelspreadsheets/' + excelFile)
for sheetName in wb.sheetnames:
# Loop through every sheet in the workbook
sheet = wb[sheetName]
sheetTitle = sheet.title
# Create the CSV filename from the Excel filename and sheet title
p = Path(excelFile)
excelFileStemName = p.stem
CsvFilename = excelFileStemName + '_' + sheetTitle + '.csv'
# Create the csv.writer object for this CSV file
print(f'Creating filename {CsvFilename}...')
outputFile = open(CsvFilename, 'w', newline='')
outputWriter = csv.writer(outputFile)
# Create reader object for each excel sheet
# NOTE(review): fileObj and fileReaderObj are not used anywhere below, and
# fileObj is never closed in the visible code.
fileObj = open('./excelspreadsheets/' + excelFile)
fileReaderObj = csv.reader(fileObj)
# Loop through every row in the excel sheet
for rowNum in range(1, sheet.max_row + 1):
rowData = [] # append each cell to this list
# Loop through each cell in the row
for colNum in range(1, sheet.max_column + 1):
# NOTE(review): rowNum and colNum are not used here; the same sheet.values
# object is appended once per column instead of one cell value. Presumably
# this is where the repeated output comes from -- confirm.
rowData.append(sheet.values)
# write the rowData list to the CSV file.
for row in rowData:
outputWriter.writerows(row)
outputFile.close()
So, each of the newly created .csv files writes the correct data but does it 10 times, rather than once.
Appreciate any feedback thanks.
You can use read_excel and to_csv, which come as part of pandas to read excel file and write the data to csv file. It is just simpler from coding perspective, as the read and write will be done in one line. It also uses Openpyxl underneath. The updated code is below.
import openpyxl, os, csv
from pathlib import Path
import pandas as pd
def csv_name_for(excel_file, sheet_name):
    """Return the CSV file name for one sheet of an Excel workbook.

    excel_file: workbook file name, e.g. ``'sales.xlsx'``.
    sheet_name: title of the worksheet being exported.

    Path.stem drops only the final extension. ``str.rstrip('.xlsx')`` would
    instead strip any trailing '.', 'x', 'l' or 's' characters, turning
    'sales.xlsx' into 'sale'.
    """
    return f'{Path(excel_file).stem}_{sheet_name}.csv'


def convert_workbooks_to_csv(folder='./excelspreadsheets'):
    """Write every sheet of every .xlsx file in *folder* to its own CSV.

    The CSV files are created in the current working directory and named
    ``<workbook stem>_<sheet name>.csv``.
    """
    for excelFile in os.listdir(folder):
        if not excelFile.endswith('.xlsx'):  # Skip non xlsx files
            continue
        # Open the workbook once and reuse it for all of its sheets.
        with pd.ExcelFile(os.path.join(folder, excelFile)) as xls:
            for sheetname in xls.sheet_names:
                # Read each sheet into df
                df = xls.parse(sheetname)
                CsvFilename = csv_name_for(excelFile, sheetname)
                print(f'Creating filename {CsvFilename}...')
                # Write df as CSV to file
                df.to_csv(CsvFilename, index=False)


if __name__ == '__main__':
    convert_workbooks_to_csv()
Let me know if you see any errors...

Writing and appending multiple csv data into new csv using python

I have a directory where there are multiple csv files. Currently I am able to read all the files sequentially using for loop and display their contents.
I need to write the contents of all the csv files sequentially into a new csv file, but I am missing something, as my new csv has no data in it.
this is what I am doing :
import os
import csv
# Asker's attempt: read every file in `path` with csv.reader, print the rows,
# and write them to ALL_DATA.csv.
path = r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\\'
fileNames = os.listdir(path)
for f in fileNames:
# NOTE(review): the input file is opened without a matching close().
file = open(path+f)
csvreader = csv.reader(file)
rows = []
for row in csvreader:
rows.append(row)
for i in rows:
print(i)
#OFile = open('C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\ALL_DATA.csv','w')
# NOTE(review): 'wb' truncates the target on every open(), and the writer's
# file object is never closed or flushed explicitly. The path literal is not a
# raw string, unlike `path` above.
writer = csv.writer(open('C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ\ALL_DATA.csv', 'wb'))
#for row in csvreader:
# row1 = csvreader.next()
writer.writerow(i)
You are overwriting the file each row you try to write.
Using the w argument for the open method will overwrite existing files.
The argument you need to use when you want to append to a file (or create it if it does not exist yet) is 'a'.
See Python File Open for more information about Python file modes.
import os
import csv
def merge_csv_files(path, output_name='ALL_DATA.csv'):
    """Append the rows of every file in *path* to one combined csv file.

    path: directory holding the input csv files.
    output_name: name of the combined file, created inside *path*.
    Returns the full path of the combined file.

    The combined file is opened once, in append mode, so earlier rows are
    never overwritten. Running the function again appends the data a second
    time; delete the combined file first if that is not wanted.
    """
    output_path = os.path.join(path, output_name)
    # The combined file lives in the input directory, so leave it out of the
    # inputs; otherwise it would be read back into itself.
    fileNames = sorted(
        name for name in os.listdir(path)
        if name != output_name and os.path.isfile(os.path.join(path, name))
    )
    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows).
    with open(output_path, 'a', newline='') as output:
        writer = csv.writer(output)
        for f in fileNames:
            with open(os.path.join(path, f), 'r', newline='') as file:
                csvreader = csv.reader(file)
                for row in csvreader:
                    print(row)
                    writer.writerow(row)
    return output_path


if __name__ == '__main__':
    # Raw string: in a normal literal '\U' starts a unicode escape on Python 3.
    merge_csv_files(r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ')
If the csv files have the same columns and formats you could also simply copy the first file and append the others, excluding their headers.
import os
import shutil
def concat_csv_files(path, output_name='ALL_DATA.csv'):
    """Concatenate same-format csv files, keeping only the first header.

    path: directory holding the input csv files.
    output_name: name of the combined file, created inside *path*.
    Returns the full path of the combined file, or None when *path* holds
    no input files.

    The first file is copied verbatim; each remaining file is appended
    without its first line. Input files are expected to end with a newline.
    """
    output = os.path.join(path, output_name)
    # Skip the combined file itself so a second run does not fold the old
    # output into the new one.
    fileNames = sorted(
        name for name in os.listdir(path)
        if name != output_name and os.path.isfile(os.path.join(path, name))
    )
    if not fileNames:
        return None

    # Copy the first file:
    shutil.copyfile(os.path.join(path, fileNames[0]), output)
    # Append the remaining file contents, excluding each first line
    with open(output, 'a') as out:
        for file in fileNames[1:]:
            with open(os.path.join(path, file), 'r') as in_:
                next(in_, None)       # drop the header line, if any
                out.writelines(in_)   # stream the rest without loading it all
    return output


if __name__ == '__main__':
    concat_csv_files(r'C:\Users\hu170f\Documents\WORK\MAAP_FILE_DB_REQ')

Saving multiple files using template and for loop Python

Inside my input dir I have three reports FinalReport_Nm, FinalReport_S01, FinalReport_S02 etc. I will be adding about 50 more reports to this so the naming will continue on with S03, S04, T01, T02 etc. What I want this script to do is loop through the folder of reports, take FinalReport_NM, and paste it into my template, and then save this as SecondaryReport_1a_NM, and then loop back through and copy FinalReport_S01, paste it to the template, and save as SecondaryReport_1a_S01 etc.
I thought that creating the schedules NM, S01, S02 as seen below in the script and concatenating them at the bottom, where it says output_file, would work, but this is a huge fail. How can I get this script to rename the files as it loops through them?
import openpyxl as xl;
import os
# Asker's attempt: copy the cells of each report workbook in input_dir into a
# template workbook and save the result under a new name.
input_dir = 'C:\\Python\\Reports'
output_dir = 'C:\\Reports\\output'
template = 'C:\\Python\\Report_Template.xlsx'
NewFileName = 'SecondaryReport_1a_'
schedule_index = 0
schedules=['Nm', 'S01', 'S02']
# NOTE(review): os.path.isfile(file) receives the bare name from listdir, so it
# is resolved against the current working directory rather than input_dir.
files = [file for file in os.listdir(input_dir)
if os.path.isfile(file) and file.endswith('.xlsx')]
for file in files:
input_file = os.path.join(input_dir, file)
wb=xl.load_workbook(input_file)
# Source data is taken from the workbook's second worksheet (index 1).
ws=wb.worksheets[1]
# Open template
wb2 = xl.load_workbook(template)
# Destination is the template's third worksheet (index 2).
ws2 = wb2.worksheets[2]
# calculate total number of rows and
# columns in source excel file
mr = ws.max_row
mc = ws.max_column
# copying the cell values from source
# excel file to destination excel file
for i in range (1, mr + 1):
for j in range (1, mc + 1):
# reading cell value from source excel file
c = ws.cell(row = i, column = j)
# Cells for source data to pasted inside Template
# Values land 12 rows down and 1 column right of their source position.
ws2.cell(row = i+12, column = j+1).value = c.value
# saving the destination excel file
# NOTE(review): this builds a tuple rather than a path, summaryFile is not
# defined in the visible code, and NewFileName above is never used.
output_file = (output_dir, f"{summaryFile}_{schedules[schedule_index]}")
schedule_index += 1
wb2.save(output_file)
As I understand the question, you have several xlsx files that have file names of the format "FinalReport_(suffix).xlsx", where suffix can be "Nm", "S01", "S02", "T01", etc. For each one, you want to create a new file with a name like "SecondaryReport_1a_(suffix).xlsx", where the suffix is the same.
For this simple case, the suffix can be extracted using string slicing:
prefix_length = len("FinalReport_")
# Drop the extension first, then keep everything AFTER the prefix:
# "FinalReport_S01.xlsx" -> "FinalReport_S01" -> "S01".
# (file[:prefix_length] would return the prefix itself.)
suffix = os.path.splitext(file)[0][prefix_length:]
The output filename can be created like so:
# Build the destination path inside output_dir from the extracted suffix.
output_file = os.path.join(output_dir, f"SecondaryReport_1a_{suffix}.xlsx")

delete rows by date and add file name column for multiple csv

I have multiple "," delimited csv files with recorded water pipe pressure sensor data, already sorted by date older-newer. For all original files, the first column always contains dates formated as YYYYMMDD. I have looked at similar discussion threads but couldn't find what I need.
Python script to add a new column to every csv file in the directory, where each row of the new column titled as "Pipe" would have a file name, omitting file extension string.
Have the option of specifying a cut-off date as YYYYMMDD in order to delete rows in the original input file. For example, if some file has dates 20140101 to 20140630, I would like to cut out rows of data if their date is < 20140401.
Have the option of either to overwrite the original files after having made these modifications or save each file to a different directory, with file names same as the originals.
Input: PipeRed.csv; Headers: Date,Pressure1,Pressure2,Temperature1,Temperature2 etc,
Output: PipeRed.csv; Headers: Pipe,Date,Pressure1,Pressure2,Temperature1, Temperature2,etc,
I have found some code and modified it a little, but it doesn't delete rows like was described above and adds the file name column last rather than 1st.
import csv
import sys
import glob
import re
# Asker's attempt: rewrite each csv file matched by the command-line glob in
# place, appending the file name (minus '.csv') to every row.
for filename in glob.glob(sys.argv[1]):
#def process_file(filename):
# Read the contents of the file into a list of lines.
f = open(filename, 'r')
contents = f.readlines()
f.close()
# Use a CSV reader to parse the contents.
reader = csv.reader(contents)
# Open the output and create a CSV writer for it.
# NOTE(review): the same filename is reopened for writing, so the original is
# overwritten; 'wb' is the Python 2 csv convention.
f = open(filename, 'wb')
writer = csv.writer(f)
# Process the header.
# NOTE(review): the writer is created a second time here, and a hard-coded
# header row is written before the file's own header row below.
writer = csv.writer(f)
writer.writerow( ('Date','Pressure1','Pressure2','Pressure3','Pressure4','Pipe') )
# NOTE(review): reader.next() is Python 2 only. The file name is appended to
# the header row instead of a column title.
header = reader.next()
header.append(filename.replace('.csv',""))
writer.writerow(header)
# Process each row of the body.
# The new value is appended, so the column ends up last rather than first;
# no date filtering is done.
for row in reader:
row.append(filename.replace('.csv',""))
writer.writerow(row)
# Close the file and we're done.
f.close()
This function should be very close to what you want. I've tested it in both Python 2.7.9 and 3.4.2. The initial version I posted had some problems because — as I mention then — it was untested. I'm not sure if you're using Python 2 or 3, but this worked properly in either one.
Another change from the previous version is that the optional keyword date argument's name has been changed from cutoff_date to start_date to better reflect what it is. A cutoff date usually means the last date on which it is possible to do something, which is the opposite of the way you used it in your question. Also note that any date provided should be a string, i.e. start_date='20140401', not an integer.
One enhancement is that it will now create the output directory if one is specified but doesn't already exist.
import csv
import os
import sys
def open_csv(filename, mode='r'):
    """Open a csv file in the proper mode for the running Python version.

    Python 2's csv module expects binary mode, while Python 3's expects
    text mode with ``newline=''``.

    filename: path of the file to open.
    mode: 'r', 'w' or 'a', without a 'b' flag.
    Returns the open file object; the caller must close it.
    """
    if sys.version_info[0] == 2:
        return open(filename, mode=mode + 'b')
    return open(filename, mode=mode, newline='')


def process_file(filename, start_date=None, new_dir=None):
    """Prepend a 'Pipe' column holding the file's base name to a csv file.

    filename: path of the csv file; its first column must hold YYYYMMDD dates.
    start_date: optional earliest date to keep ('20140401' or 20140401);
        rows dated before it are dropped. Falsy means keep every row.
    new_dir: optional directory for the result (created if missing). When
        omitted, the input file is overwritten in place.

    An empty input file is left untouched and nothing is written.
    """
    # Dates are compared as text, so also accept an int such as 20140401.
    start_date = str(start_date) if start_date else None

    # Read the entire contents of the file into memory, skipping rows before
    # any start_date given (assuming row[0] is a date column).
    with open_csv(filename, 'r') as f:
        reader = csv.reader(f)
        header = next(reader, None)  # Save first row.
        if header is None:
            return  # Empty file: no header, nothing to process.
        # Blank rows have no date to compare, so they are dropped when
        # filtering instead of raising IndexError.
        contents = [row for row in reader
                    if not start_date or (row and row[0] >= start_date)]

    # Create different output file path if new_dir was specified.
    basename = os.path.basename(filename)  # Remove dir name from filename.
    output_filename = os.path.join(new_dir, basename) if new_dir else filename
    if new_dir and not os.path.isdir(new_dir):  # Create directory if necessary.
        os.makedirs(new_dir)

    # Data for new column is the base filename without extension.
    new_column = [os.path.splitext(basename)[0]]

    # Open the output file and create a CSV writer for it.
    with open_csv(output_filename, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Pipe'] + header)  # Prepend new column name.
        # Process each row of the body by prepending data for new column.
        writer.writerows(new_column + row for row in contents)

How do I extract data from multiple text files to Excel using Python? (One file's data per sheet)

So far for my code to read from text files and export to Excel I have:
import glob
# Asker's attempt: collect the lines of every .txt file in the current
# directory, then write them side by side to summary.xls.
data = {}
for infile in glob.glob("*.txt"):
with open(infile) as inf:
# l[:-1] drops the last character of each line (presumably the newline).
data[infile] = [l[:-1] for l in inf]
# NOTE(review): this writes plain tab-separated text under an .xls name; no
# Excel workbook structure (or separate sheets) is produced.
with open("summary.xls", "w") as outf:
# Header row: the file names, tab-separated.
outf.write("\t".join(data.keys()) + "\n")
# zip(*...) pairs up the n-th line of every file and stops at the shortest file.
for sublst in zip(*data.values()):
outf.write("\t".join(sublst) + "\n")
The goal with this was to reach all of the text files in a specific folder.
However, when I run it, Excel gives me an error saying,
"File cannot be opened because: Invalid at the top level of the document. Line 1, Position 1. outputgooderr.txt outputbaderr.txt. fixed_inv.txt
Note: outputgooderr.txt, outputbaderr.txt.,fixed_inv.txt are the names of the text files I wish to export to Excel, one file per sheet.
When I only have one file for the program to read, it is able to extract the data. Unfortunately, this is not what I would like since I have multiple files.
Please let me know of any ways I can combat this. I am very much so a beginner in programming in general and would appreciate any advice! Thank you.
If you're not opposed to having the outputted excel file as a .xlsx rather than .xls, I'd recommend making use of some of the features of Pandas. In particular pandas.read_csv() and DataFrame.to_excel()
I've provided a fully reproducible example of how you might go about doing this. Please note that I create 2 .txt files in the first 3 lines for the test.
import pandas as pd
import numpy as np
import glob
def sheet_name_for(path):
    """Return an Excel sheet name derived from a text file path.

    Only the final extension is removed (``str.replace('.txt', '')`` would
    remove every occurrence), and the name is cut to 31 characters, the
    longest sheet name Excel accepts. Names that only differ after the 31st
    character would therefore clash.
    """
    import os  # local import: this snippet's own import block has no os
    stem = os.path.splitext(os.path.basename(path))[0]
    return stem[:31]


def text_files_to_excel(pattern='*.txt', output='summary.xlsx'):
    """Write every file matching *pattern* to its own sheet of *output*.

    pattern: glob pattern selecting the comma-separated text files.
    output: path of the .xlsx workbook to create.
    """
    # The context manager saves and closes the workbook on exit;
    # ExcelWriter.save() no longer exists in pandas 2.0 and later.
    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
        # One pass over the files: read each and write it to its own sheet.
        for infile in sorted(glob.glob(pattern)):
            df = pd.read_csv(infile)
            df.to_excel(writer, sheet_name=sheet_name_for(infile), index=False)


if __name__ == '__main__':
    # Creating a dataframe and saving as test_1.txt/test_2.txt in current directory
    # feel free to remove the next 3 lines if you want to test in your directory
    df = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))
    df.to_csv('test_1.txt', index=False)
    df.to_csv('test_2.txt', index=False)

    text_files_to_excel()
Output example:

Categories

Resources