Steps:
Read multiple .html files in the directory
Extract the title of each HTML file
Need:
- Write the titles into individual .txt files
Expected: Any advice. Ideally I want to extract the integers from each HTML file's name ('23434.html') and name the corresponding text file '23434.txt'.
Results:
- No txt file is created in the designated path.
- Nothing gets written.
import glob
import os
import re
from bs4 import BeautifulSoup

for file_name in glob.glob(os.path.join(dir_path, "*.html")):
    with open(file_name) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
        d = soup.title.get_text()
        #resultfile = re.findall(r'\d+', file_name)
        with open("m" + ".txt", "w") as outfile:
            outfile.write(d)
The problem is that every title goes to the same file, "m.txt", in the current working directory, so nothing appears in the designated path and each write overwrites the last. Build the output name from the HTML file's own name and write it into the same directory:

import glob
import os
from bs4 import BeautifulSoup

for fpath in glob.glob(os.path.join(dir_path, "*.html")):
    with open(fpath) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
        html_title = soup.title.get_text()
    html_number = os.path.basename(fpath).rsplit('.', 1)[0]  # '23434.html' -> '23434'
    with open(os.path.join(dir_path, html_number + '.txt'), 'w') as outfile:
        outfile.write(html_title)
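If you would rather use the commented-out regex idea, a minimal sketch (assuming each file name contains exactly one run of digits, as in '23434.html'):

import os
import re

fpath = '/some/dir/23434.html'  # hypothetical example path
digits = re.findall(r'\d+', os.path.basename(fpath))  # basename first, so digits in dir_path don't match
if digits:  # guard against names without digits
    txt_name = digits[0] + '.txt'  # -> '23434.txt'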
Related
I am working on merging a number of text files into a single text document. I am able to read all the file names and create a new output document.
However, when I write the output, I only get the data from one file, not the rest. Overall it should be close to 1 million lines in the merged txt, but I am only getting the first 10k.
import os

projpath1 = 'PATH1'
projpath2 = 'PATH2'

for root, dirs, files in os.walk(f"{projpath1}", topdown=False):
    for name in files:
        if not name.startswith('.DS_Store'):
            split = name.split("/")
            title = split[0]
            filename = (os.path.join(root, name))
            inputf = os.path.expanduser(f'{projpath1}/{title}')
            updatedf = os.path.expanduser(f'{projpath2}/ENC_merged.txt')
            with open(inputf, "r") as text_file, open(updatedf, 'w') as outfile:
                for info in text_file:
                    for lines in info:
                        outfile.write(lines)
I really am stuck and can't figure it out :/
You are supposed to open the output file first and write all the input files inside that single with block, so the output is not reopened (and truncated) on every iteration. Something like this should work for you:
import os

projpath1 = 'PATH1'
projpath2 = 'PATH2'
# define the output path before opening it
updatedf = os.path.expanduser(f'{projpath2}/ENC_merged.txt')

with open(updatedf, 'w') as outfile:
    for root, dirs, files in os.walk(projpath1, topdown=False):
        for name in files:
            if not name.startswith('.DS_Store'):
                inputf = os.path.join(root, name)  # full path to the input file
                with open(inputf, "r") as text_file:
                    for line in text_file:
                        outfile.write(line)
What about doing it with bash?
ls | xargs cat > merged_file
With Python I'm attempting to edit a series of text files to insert a series of strings. I can do so successfully with a single txt file. Here's my working code, which appends messages before and after the main body within the txt file:
filenames = ['text_0.txt']

with open("text_0.txt", "w") as outfile:
    for filename in filenames:
        with open(filename) as infile:
            header1 = "Message 1:"
            lines = "\n\n\n\n"
            header2 = "Message 2:"
            contents = header1 + infile.read() + lines + header2
            outfile.write(contents)
I'm seeking some assistance in structuring a script to iteratively make the same edits to a series of similar txt files in the directory. There are 20 or so similar txt files, all structured the same way: text_0.txt, text_1.txt, text_2.txt, and so on. Any assistance is greatly appreciated.
To loop through a folder of text files, you can do it like this:

import os

YOURDIRECTORY = "TextFilesAreHere"  ## this is the folder where your text files are
for file in os.listdir(YOURDIRECTORY):
    filename = os.fsdecode(file)
    with open(YOURDIRECTORY + "/" + filename, "r") as textfile:
        ### do what you want with textfile
        pass
If you already know the file naming, then you can simply loop:

filenames = [f'text_{index}.txt' for index in range(21)]

for file_name in filenames:
    header1 = "Message 1:"
    lines = "\n\n\n\n"
    header2 = "Message 2:"
    # read the original contents first, then reopen for writing: opening with
    # 'w' before reading would truncate the file you still need to read
    with open(file_name) as infile:
        contents = header1 + infile.read() + lines + header2
    with open(file_name, "w") as outfile:
        outfile.write(contents)

Reading each file completely before reopening it for writing avoids destroying the contents you still need.
Or loop over the directory like:

import os

for filename in os.listdir(directory):
    # do something, like check whether the filename is in your list
    ...
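Putting the two together, a minimal sketch (assuming the folder is named TextFilesAreHere and the files follow the text_N.txt naming from the question):

import os

directory = "TextFilesAreHere"  # assumed folder name; adjust to your path

for filename in os.listdir(directory):
    # only touch files that follow the text_N.txt naming scheme
    if filename.startswith("text_") and filename.endswith(".txt"):
        path = os.path.join(directory, filename)
        with open(path) as infile:
            contents = "Message 1:" + infile.read() + "\n\n\n\n" + "Message 2:"
        with open(path, "w") as outfile:
            outfile.write(contents)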
At first, I tried to open just one file named 'index.html', parse it, and save it as a csv file. This was the code, and it worked well:
import csv
from bs4 import BeautifulSoup

with open('/Users/kwon/Downloads/cnn/index.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')

cnn_file = open('cnn2.csv', 'w')
cnn_writer = csv.writer(cnn_file)
cnn_writer.writerow(['filename', 'date', 'headline', 'text'])

filename = 'index000'
print(filename)

date = soup.find(class_='update-time').text
date = date.split(' ')[5] + ' ' + date.split(' ')[6] + ' ' + date.split(' ')[7]
print(date)

headline = soup.title.text
headline = headline.split('-')[0]
print(headline)

txt = soup.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-r'\d*'-containers").text
print(txt)

cnn_writer.writerow([filename, date, headline, txt])
cnn_file.close()
But I want to iterate the same process for all the html files (index.html ~ index591.html) in a directory. So I started by using the glob module to open the files sequentially, then tried a for loop to parse them as I did before. Somehow I don't know how to read and parse them sequentially and name each filename from 'index000' to 'index591'. Also, if I run the code below, I get the error 'find() takes no keyword arguments'.
import glob
import bs4

path = '/Users/kwon-yejin/Downloads/cnn2/*.html'
files = glob.glob(path)

for file in files:
    html = open(file, 'r')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    for line in soup:
        filename = 'index000'
        print(filename)

        date = line.find(class_='update-time').text
        date = date.split(' ')[5] + ' ' + date.split(' ')[6] + ' ' + date.split(' ')[7]
        print(date)

        headline = line.title.text
        headline = headline.split('-')[0]
        print(headline)

        txt = line.find(class_="zn zn-body-text zn-body zn--idx-0 zn--ordinary zn-has-multiple-containers zn-has-21-containers").text
        print(txt)
Read and parse them sequentially, building the filenames 'index000' to 'index591':
path = '/Users/kwon-yejin/Downloads/cnn2/'
for i in range(592):
    file = path + 'index' + str(i).zfill(3) + '.html'
    print(file)
/Users/kwon-yejin/Downloads/cnn2/index000.html
/Users/kwon-yejin/Downloads/cnn2/index001.html
/Users/kwon-yejin/Downloads/cnn2/index002.html
/Users/kwon-yejin/Downloads/cnn2/index003.html
..................
/Users/kwon-yejin/Downloads/cnn2/index589.html
/Users/kwon-yejin/Downloads/cnn2/index590.html
/Users/kwon-yejin/Downloads/cnn2/index591.html
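Building on that, a minimal sketch of the full loop. It parses each whole document with soup directly instead of iterating over it (iterating yields plain strings, whose str.find takes no keyword arguments, which is exactly the error you saw), and it assumes the article body reliably carries the zn-body-text class as in your snippets:

import csv
import bs4

path = '/Users/kwon-yejin/Downloads/cnn2/'

with open('cnn2.csv', 'w', newline='') as cnn_file:
    cnn_writer = csv.writer(cnn_file)
    cnn_writer.writerow(['filename', 'date', 'headline', 'text'])
    for i in range(592):
        filename = 'index' + str(i).zfill(3)  # 'index000' ... 'index591'
        with open(path + filename + '.html', 'r') as html:
            soup = bs4.BeautifulSoup(html, 'html.parser')
        date = ' '.join(soup.find(class_='update-time').text.split(' ')[5:8])
        headline = soup.title.text.split('-')[0]
        # bs4 matches class_ against each class individually, so the stable
        # 'zn-body-text' token matches even though the full class list varies
        txt = soup.find(class_='zn-body-text').text
        cnn_writer.writerow([filename, date, headline, txt])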
I have a set of text documents (basically emails saved as text files). I have to read these and write them into a CSV or a Pandas data frame, one email/text file per row.
I am new to Python and don't know how to proceed with this problem. Please help.
Filename    Content
email1      Content of email 1
email2      Content of email 2
email3      Content of email 3
…           …
email n     Content of email n
Edit
I was using the code below:
import csv
import os

dirpath = 'path'
output = 'output_file.csv'

with open(output, 'w') as outfile:
    csvout = csv.writer(outfile)
    csvout.writerow(['FileName', 'Content'])
    files = os.listdir(dirpath)
    for filename in files:
        with open(dirpath + '/' + filename) as afile:
            csvout.writerow([filename, afile.read()])
You can start to work from here:

import csv  # the standard-library csv module

emails = ['content of email 1', 'content of email 2']  # placeholder list of email bodies

with open('example.csv', 'w', newline='') as csvfile:  # create a new csv
    fieldnames = ['text']  # the name of the column
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for email in emails:
        writer.writerow({'text': email})  # write a row per email

P.S. This works with Python 3.6. Good luck!
The answer provided here worked:
Combine a folder of text files into a CSV with each content in a cell
import csv
import os
from pathlib import Path

os.chdir('file path')

with open('big.csv', 'w') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(['FileName', 'Content'])
    for fileName in Path('.').glob('*.txt'):
        csv_out.writerow([str(fileName), open(str(fileName.absolute())).read().strip()])
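Since the question also mentions a Pandas data frame, a minimal sketch of that variant (assuming the same one-file-per-email layout; dirpath and the column names follow the snippets above):

import os
import pandas as pd

dirpath = 'path'  # folder containing the email text files

rows = []
for filename in os.listdir(dirpath):
    with open(os.path.join(dirpath, filename)) as afile:
        rows.append({'FileName': filename, 'Content': afile.read()})

df = pd.DataFrame(rows, columns=['FileName', 'Content'])  # one row per email
df.to_csv('output_file.csv', index=False)  # or keep working with df in memory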
I have 2 or more .txt files, containing:
file1.txt
India
file2.txt
US
I want to write the output into a third file as: India US
Can anyone please tell me how to do it using Python?
import glob
import os

# glob.glob takes a single pattern, so join the directory and the wildcard
all_text_files = glob.glob(os.path.join('/path/to/dir', '*.txt'))

with open('output_file.txt', 'w') as fh:
    for text_file in all_text_files:
        with open(text_file, 'r') as data:
            fh.write(data.read())
glob.glob('*.txt') returns ALL the .txt files in the current directory.
If you want to read only a few files, you can specify them in a list
all_text_files = ['file1.txt', 'file2.txt', ....., 'filen.txt']
source_files = ['file1.txt', 'file2.txt']

with open('output.txt', 'w') as fh_out:
    for fname in source_files:
        with open(fname, 'r') as fh:
            fh_out.write(fh.read())
files = ['file1.txt', 'file2.txt']

# open the output once, outside the loop, so later files append to it instead
# of overwriting what the first file wrote
with open('file3.txt', 'w') as file_put:
    for file in files:
        with open(file, 'r') as file_read:
            file_put.write(file_read.read())
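To get literally India US on one line, a minimal sketch that strips each file's trailing newline and joins the contents with a single space (same file names as above):

files = ['file1.txt', 'file2.txt']

contents = []
for fname in files:
    with open(fname) as fh:
        contents.append(fh.read().strip())  # 'India', 'US'

with open('file3.txt', 'w') as out:
    out.write(' '.join(contents))  # writes 'India US'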