merge multiple files into a single folder - python

I have corpus named ZebRa consisting of 7 folders, each having 10 files inside. I want to merge the 10 files inside each folder, in order to have finally only 7 folders. Here is what I have tried:
import os
def CombineFiles(file_path):
with open(file_path, 'r', encoding="utf-8") as f:
OutFile = open('D:/1.txt', 'w', encoding="utf-8")
lines = f.read().splitlines()
for i in range(len(lines)):
lines[i] = lines[i].replace('\n', '')
lines.append('\n')
for i in range(len(lines)):
OutFile.write(lines[i])
return OutFile
for root, dirs, files in os.walk("C:/ZebRa", topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
CombineFiles(file_path)
However, it seems that each time it empties the content of OutFile and the stored output is only the content of the last file in the last folder
I have also tried the following, however, the output will be an empty file:
import os
for root, dirs, files in os.walk("C:/ZebRa", topdown= False):
print(files)
with open('D:/1.txt', 'w', encoding="utf-8") as OutFile:
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
OutFile.write(f.read())

Change open('D:/1.txt', 'w', encoding="utf-8") to open('D:/1.txt', 'a', encoding="utf-8"). a flag is used to append new data to end of the file, while the w flag always rewrite the file. See this tutorial.

Related

How to delete a string from all files in a directory

I want all files in directory "path" to have the string "error" removed from them and the result to be saved in the same file that was editted. My current code (below) ends up clearing up the entire file, rather than just removing the string and keeping everything else the same.
import os
path = "path"
files = os.listdir(path)
error = "string"
for index, file in enumerate(files):
with open(os.path.join(path, file)) as fin, open(os.path.join(path, file), "w+") as fout:
for line in fin:
line = line.replace(error, "f")
fout.write(line)
import os
path = "path"
files = os.listdir(path)
error = "string"
for index, file in enumerate(files):
with open(os.path.join(path, file), 'r') as fin:
d = din.read()
with open(os.path.join(path, file), "w") as fout:
d = d.replace(error, "")
fout.write(d)
This is the correct way to do this:
import os
path = "path"
for file in os.listdir(path):
if not os.path.isdir(file):
with open(file, 'r+') as fd:
contents = fd.read().replace('error', '')
fd.seek(0)
fd.write(contents)
fd.truncate()

Loop through files in a folder and create a new merged text file

I am working on merging a number of text files together into a single text document. I am able to read all the file names and create a new output document.
However, when I output the document, I am only getting the data from one file and not the rest? Overall it should be close to 1 million lines in a txt, but only getting the first 10k
import os
projpath1 = 'PATH1'
projpath2 = 'PATH2'
for root, dirs, files in os.walk(f"{projpath1}", topdown=False):
for name in files:
if not name.startswith('.DS_Store'):
split = name.split("/")
title = split[0]
filename = (os.path.join(root, name))
inputf = os.path.expanduser(f'{projpath1}/{title}')
updatedf = os.path.expanduser(f'{projpath2}/ENC_merged.txt')
with open(inputf, "r") as text_file, open(updatedf, 'w') as outfile:
for info in text_file:
for lines in info:
outfile.write(lines)
I really am stuck and can't figure it out :/
You are suppose to open create output file first and within it you need to save all the input files, something like this should work for you.
import os
projpath1 = 'PATH1'
projpath2 = 'PATH2'
with open(updatedf, 'w') as outfile:
for root, dirs, files in os.walk(f"{projpath1}", topdown=False):
for name in files:
if not name.startswith('.DS_Store'):
split = name.split("/")
title = split[0]
filename = (os.path.join(root, name))
inputf = os.path.expanduser(f'{projpath1}/{title}')
updatedf = os.path.expanduser(f'{projpath2}/ENC_merged.txt')
with open(inputf, "r") as text_file:
for info in text_file:
for lines in info:
outfile.write(lines)
What about doing it with bash
ls | xargs cat > merged_file

Merge csv files, add original file name to each row in output file

I have multiple csv files in a folder with the same data structure,
0.00;1.05;10.5
0.01;2.05;15.5
0.02;3.05;20.5
...
I want ot merge all the csv files to 1 summary file and add a column with file name to each line depanding on the original data source.
0.00;1.05;10.5;csv1.csv
0.01;2.05;15.5;csv1.csv
0.02;3.05;20.5;csv1.csv
0.00;5.05;0.05;csv2.csv
0.01;6.05;1.05;csv2.csv
0.02;7.05;2.05;csv2.csv
...
I managed to merge the files, but cant find a way to add the file names.
files = []
for file in os.listdir(folder):
if file.endswith('.csv'):
files.append(file)
with open('results.csv', 'w', newline='') as fw:
cw = csv.writer(fw)
for file in files:
with open(file, newline='') as f:
cr = csv.reader(islice(f,13,None)
cw.writerows(cr)
I dont want to use pandas concat due to ram limitations.
Thanks you.
You don't need to parse the input csv files, just append a delimiter and then the current file name to each line. You can use the fileinput module:
import fileinput
from pathlib import Path
folder = '.' # set accordingly, assume current directory
path = Path(folder)
with fileinput.input(files=path.glob('*.csv')) as f, open('results.csv', 'w') as outfile:
for line in f:
print(';'.join([line.rstrip('\n'), fileinput.filename().name()]), file=outfile)
Regarding your code, you can fix it like this:
import os
import csv
folder = '.'
files = []
for file in os.listdir(folder):
if file.endswith('.csv'):
files.append(file)
with open('results.csv', 'w', newline='') as fw:
cw = csv.writer(fw, delimiter=';')
for file in files:
with open(file, newline='') as f:
for row in csv.reader(f, delimiter=';'):
row.append(file)
cw.writerow(row)
Here the delimiter argument is set to semi-colon because the default delimiter is comma and your files are using ;. That will fix the proper parsing of the input csv files, and use ; for the output file. Then each input file is processed by reading each line and appending the filename to the row list. Finally the new row is written to the output CSV file.
You can use os and pandas:
import os
import pandas as pd
basedir = <path of your base folder>
all_dfs = []
for filename in filter(lambda f: os.path.splitext(f)[1] == '.csv', next(os.walk(basedir))[2]):
curr_df = pd.read_csv(os.path.join(basedir, filename), sep=';', header=None)
curr_df['filename'] = filename
all_dfs.append(curr_df)
pd.concat(all_dfs, axis=0).to_csv('merged_cvs.csv', sep=';', header=False, index=False)
Or if you prefer in only one line:
pd.concat([pd.concat((df, pd.DataFrame([f for _ in range(len(df))])), axis=1) for f, df in
((filename, pd.read_csv(os.path.join(basedir, filename), sep=';', header=None))
for filename in filter(lambda f: os.path.splitext(f)[1] == '.csv', next(os.walk(basedir))[2]))
]).to_csv('merged_cvs.csv', sep=';', header=False, index=False)
files = []
for file in os.listdir(folder):
if file.endswith('.csv'):
files.append(file)
with open('results.csv', 'w', newline='') as fw:
cw = csv.writer(fw)
for file in files:
with open(file, newline='') as f:
fw.write(f"{file}\n") # just write the filename before the content :)
cr = csv.reader(islice(f,13,None)
cw.writerows(cr)

Copying selected lines from files in different directories to another file

I have a directory with many subdirectories, containing files. I want to open the files ending with "root.vrpj" or "root.vprj", in "App_integrations" folder and copy the lines containing the word "table" to another file.
Until now I've managed to visit each file with this code:
for root, dirs, files in os.walk(movedir):
for filename in files:
if filename.endswith(("root.vrpj", "root.vprj")):
The problem is that what I have now are just the names of the files I want to visit and I'm stuck here.
You can try this:
f = open('final_file.txt', 'w')
for root, dirs, files in os.walk(movedir):
for filename in files:
if filename.endswith("root.vrpj") or filename.endswith("root.vprj"):
with open(filename) as data:
for line in data:
if "table" in data:
f.write('{}\n'.format(data))
f.close()
This is a version of Ajax' code that closes the files you open in the loop (and fixes a couple of other minor issues):
with open('final_file.txt', 'w') as f:
for root, dirs, files in os.walk(movedir):
for filename in files:
if filename.endswith(("root.vrpj"), ("root.vprj")):
with open(os.path.join(root, filename)) as finput:
for line in finput:
if 'table' in line:
f.write(line)
however, when you see 8 levels of indentation you need to refactor, e.g.:
def find_files(startdir, *extensions):
for root, dirs, files in os.walk(movedir):
for filename in files:
if filename.endswith(extensions):
yield os.path.join(root, filename)
def find_lines(fname, text):
with open(fname) as fp:
return [line for line in fp if text in line]
with open('final_file.txt', 'w') as f:
for fname in find_files(movedir, 'root.vrpj', 'root.vprj'):
f.writelines(find_lines(fname, 'table'))
I finally solved it
import os
rootdir = my root folder
# creates a file f that contains all the lines of the files
# with "root.vrpj" or "root.vprj" in their name
# and who are inside "App_integrations" folders
# without duplicates
#creating the big file with all the file containing the lines I need
f = open('final_file.txt', 'a')
for root, dirs, files in os.walk(rootdir):
for filename in files:
if (filename.endswith(("root.vrpj", "root.vprj")) and ("App_Integration" in os.path.join(root, filename))):
full_name = os.path.join(root, filename)
data = open(full_name).read()
f.write(data + "\n")
f.close()
#copying the lines I need to f1 without duplicates
lines_seen = set()
f = open('final_file.txt')
f1 = open('testread1.txt', 'a')
doIHaveToCopyTheLine=False
for line in f.readlines():
if (("Table" in line) and (line not in lines_seen)):
doIHaveToCopyTheLine=True
if doIHaveToCopyTheLine:
f1.write(line)
lines_seen.add(line)
f1.close()
f.close()
Find the files
from pathlib import Path
import itertools
source_dir = Path(<source_dir>)
patterns = ['**/*root.vrpj', '**/*root.vprj']
files = itertools.chain.from_iterables(source_dir.glob(pat) for pat in patterns))
Filter the files:
def filter_lines(files):
for file in files:
if not 'App_Integration' in file.parts:
continue
with file.open('r') as file_handle:
for line in file_handle:
if 'table' in line:
yield line
Write the output
def save_lines(lines, output_file=sys.std_out):
for line in lines:
output_file.write(line)
with Path(<output_file>).open('w') as output_file:
save_lines(filter_lines(files), as output_file)

How to read multiple(more than 2) .txt files using python

I have 2 or more than .txt file contains
file1.txt
India
File2.txt
US
I wanted to write output in third file as India US.
Please any one can tell me how to do it using python.
import glob
all_text_files = glob.glob('/path/to/dir', '*.txt')
with open('output_file.txt', 'w') as fh:
for text_file in all_text_files:
data = open(text_file, 'r')
fh.write(data.read())
glob.glob('*.txt') returns ALL the .txt files in the current directory.
If you want to read only a few files, you can specify them in a list
all_text_files = ['file1.txt', 'file2.txt', ....., 'filen.txt']
source_files = ['file1.txt', 'file2.txt']
with open('output.txt', 'w') as fh_out:
for fname in source_files:
with open(fname, 'r') as fh:
fh_out.write(fh.read())
files = ['file1.txt','file2.txt']
for file in files:
with open(file,'r') as file_read:
with open('file3.txt', 'w+') as file_put:
file_put.write(file_read.read())

Categories

Resources