I have a text file with a list of names and numbers.
Example of text file format:
james 500
Katrina 200
kyle 600 etc
I want to create a CSV file from this text file with two columns, Names and Count, where Names holds the names and Count holds the numbers. Below is what I've tried so far:
import csv

class csvTest(object):
    def __init__(self):
        self.convertToCSV()

    def convertToCSV(self):
        names = []
        with open('BoysNames.txt', 'r') as b_names, open('popular_names.csv', 'w') as out_file, open('GirlsNames.txt', 'r') as g_names:
            for b_lines in b_names:
                b_lines = b_lines.strip().split('\t')
                names.append(b_lines)
            #for g_lines in g_names:
            #    g_lines = g_lines.split('\t')
            #    names.append(g_lines)
            writer = csv.writer(out_file)
            writer.writerow(('FirstName', 'Count'))
            writer.writerows(names)

if __name__ == '__main__':
    csvTest()
I'm not able to split the columns properly; everything ends up in the names list. Please help.
How about this solution with pandas? First, let's create some sample data:
import io

# first let's recreate your files
data1 = '''\
james 500
kyle 600'''
data2 = '''\
Katrina 200'''
file1 = io.StringIO(data1)
file2 = io.StringIO(data2)
Now we do the real operation:
import pandas as pd
# Let's put them in a list.
# In reality this would be a list of paths: ["path/to/file1", "path/to/file2", ...]
files = [file1, file2]
# now let's read this data with pandas to a dataframe
names = ["FirstName","Count"]
df = pd.concat(pd.read_csv(f, sep=" ", header=None, names=names) for f in files)
# and output to csv:
df.to_csv("output.csv", sep=",", index=False)
Result 'output.csv':
FirstName,Count
james,500
kyle,600
Katrina,200
Related
I have a poorly-structured CSV file named file.csv, and I want to split it up into multiple CSV using Python.
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
The new files need to be separated based on everything between the Family rows, so for example:
file1.csv
|A|B|C|
|Continent||1|
|Family|44950|file1|
|Species|44950|12|
|Habitat||4|
|Species|44950|22|
|Condition|Tue Jan 24 00:00:00 UTC 2023|4|
file2.csv
|A|B|C|
|Continent||1|
|Family|Fish|file2|
|Species|Bass|8|
|Species|Trout|2|
|Habitat|River|3|
What's the best way of achieving this when the number of rows between appearances of Species is not consistent?
If your file really looks like that ;) then you could use groupby from the standard library module itertools:
from itertools import groupby

def key(line): return line.startswith("|Family|")

family_line, file_no = None, 0
with open("file.csv", "r") as fin:
    for is_family_line, lines in groupby(fin, key=key):
        if is_family_line:
            family_line = list(lines).pop()
        elif family_line is None:
            header = "".join(lines)
        else:
            file_no += 1
            with open(f"file{file_no}.csv", "w") as fout:
                fout.write(header + family_line)
                for line in lines:
                    fout.write(line)
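To see why this works: groupby collapses consecutive lines that share a key value, so the stream alternates between runs of non-Family lines and single Family lines. A quick illustration on a few sample lines:

```python
from itertools import groupby

lines = ["|A|B|C|\n", "|Family|Fish|file2|\n", "|Species|Bass|8|\n"]

def key(line):
    return line.startswith("|Family|")

# Each group is a run of consecutive lines with the same key value.
groups = [(k, list(g)) for k, g in groupby(lines, key=key)]
```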
A Pandas solution would be:
import pandas as pd

df = pd.read_csv("file.csv", header=None, delimiter="|").fillna("")
blocks = df.iloc[:, 1].eq("Family").cumsum()
header_df = df[blocks.eq(0)]
for no, sdf in df.groupby(blocks):
    if no > 0:
        sdf = pd.concat([header_df, sdf])
        sdf.to_csv(f"file{no}.csv", index=False, header=False, sep="|")
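The cumsum line does the block numbering: each "Family" row bumps a counter, and rows before the first "Family" stay at 0, which is how the header block is identified. Illustrated on a small series:

```python
import pandas as pd

s = pd.Series(["Continent", "Family", "Species", "Family", "Species"])
blocks = s.eq("Family").cumsum()  # 0 before the first Family, then 1, 2, ...
```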
import pandas as pd

df = pd.read_csv('file.csv', delimiter='|')
groups = df.groupby('Family')
for name, group in groups:
    group.to_csv(name + '.csv', index=False)
Here is a pure-Python working method:
# Read file
with open('file.csv', 'r') as file:
    text = file.read()

# Split using |Family|
splitted_text = text.split("|Family|")

# Remove unwanted content before the first |Family|
splitted_text = splitted_text[1:]

# Add |Family| back to each part
splitted_text = ['|Family|' + item for item in splitted_text]

# Write files, numbered from 1
for i, content in enumerate(splitted_text, start=1):
    with open('file{}.csv'.format(i), 'w') as file:
        file.write(content)
I have a csv with the following entries:
apple,orange,bannana,grape
10,5,6,4
four,seven,eight,nine
yes,yes,no,yes
3,5,7,4
two,one,six,nine
no,no,no,yes
2,4,7,8
yellow,four,eight,one
no,yes,no,no
I would like to make a new csv file with the following format and so on:
apple,10,four,yes
orange,5,seven,yes
bannana,6,seven,no
grape,4,nine,yes
apple,3,two,no
orange,5,one,no
bannana,7,six,no
grape,4,nine,yes
So after grape it starts at apple with the new values.
I have tried using pandas DataFrames but can't figure out how to get the data formatted how I need it.
You could try the following in pure Python (data.csv is the name of the input file):
import csv
from itertools import islice

with open("data.csv", "r") as fin,\
        open("data_new.csv", "w") as fout:
    reader, writer = csv.reader(fin), csv.writer(fout)
    header = next(reader)
    length = len(header) - 1
    while (rows := list(islice(reader, length))):
        writer.writerows([first, *rest] for first, rest in zip(header, zip(*rows)))
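The core of this is the zip(*rows) transposition: it turns each block of rows into columns, which are then paired back up with the header entries. For one block of the sample data:

```python
header = ["apple", "orange", "bannana", "grape"]
rows = [
    ["10", "5", "6", "4"],
    ["four", "seven", "eight", "nine"],
    ["yes", "yes", "no", "yes"],
]
# zip(*rows) yields the columns of the block; pairing each with its
# header entry produces one output row per fruit.
out = [[first, *rest] for first, rest in zip(header, zip(*rows))]
```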
Or with Pandas:
import pandas as pd
df = pd.read_csv("data.csv")
df = pd.concat(gdf.T for _, gdf in df.set_index(df.index % 3).groupby(df.index // 3))
df.reset_index().to_csv("data_new.csv", index=False, header=False)
Output file data_new.csv for the provided sample:
apple,10,four,yes
orange,5,seven,yes
bannana,6,eight,no
grape,4,nine,yes
apple,3,two,no
orange,5,one,no
bannana,7,six,no
grape,4,nine,yes
apple,2,yellow,no
orange,4,four,yes
bannana,7,eight,no
grape,8,one,no
Hope it works for you.
df = pd.read_csv('<source file name>')
df.T.to_csv('<destination file name>')
You can transpose your dataframe in pandas as below.
pd.read_csv('file.csv', index_col=0, header=None).T
This question is already answered:
Can pandas read a transposed CSV?
According to your new description, the problem is completely changed.
You need to split your dataframe into subsets and merge them.
# Read dataframe without header
df = pd.read_csv('your_dataframe.csv', header=None)

# Create an empty DataFrame to store transposed data
tr = pd.DataFrame()

# Create, transpose and append subsets to the new DataFrame
for i in range(1, df.shape[0], 3):
    temp = pd.DataFrame()
    temp = temp.append(df.iloc[0])
    temp = temp.append(df.iloc[i:i+3])
    temp = temp.transpose()
    temp.columns = [0, 1, 2, 3]
    tr = tr.append(temp)
I need help converting the simple_line.txt file to a CSV file using the pandas library. However, I am unable to split the lines correctly: I want all the values after the first space to go into a single column.
Here is the file (sample_list.txt), listed row by row:
Image Label
doc_pres223.jpg Durasal
doc_pres224.jpg Tab Cefepime
doc_pres225.jpg Tab Bleomycin
doc_pres226.jpg Budesonide is a corticosteroid,
doc_pres227.jpg prescribed for inflammatory,
I want the CSV file to have two columns, Image and Label, with the filename in Image and everything after the first space in Label.
txt_file = r"./example.txt"
csv_file = r"./example.csv"
separator = "; "

with open(txt_file) as f_in, open(csv_file, "w+") as f_out:
    for line in f_in:
        f_out.write(separator.join(line.split(" ", maxsplit=1)))
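The key detail is maxsplit=1, which splits only at the first space so multi-word labels stay intact:

```python
line = "doc_pres226.jpg Budesonide is a corticosteroid,"
parts = line.split(" ", maxsplit=1)
# parts[0] is the image name, parts[1] the full label
```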
try this:
import pandas as pd

def write_file(filename, output):
    df = pd.DataFrame()
    lines = open(filename, 'r').readlines()
    for l in range(1, len(lines)):
        line = lines[l]
        arr = line.split(" ", maxsplit=1)
        image_line = arr[0]
        label_line = arr[1].replace('\n', '')
        df = df.append({'Image': image_line, 'Label': label_line}, ignore_index=True)
    df.to_csv(output)

if __name__ == '__main__':
    write_file('example.txt', 'example.csv')
If the filenames in column Image are always the same length, then you could just treat it as a fixed-width file. So the first column would be 15 characters, and the rest is the second column. Then just add two empty columns and write it to a new file.
# libraries
import pandas as pd
# set filename
filename = "simple_line.txt"
# read as fixed width
df = pd.read_fwf(filename, header=0, widths=[15, 100])
# add 2 empty columns
df.insert(1, 'empty1', '')
df.insert(2, 'empty2', '')
# save as a new csv file
filenew = "output.csv"
df.to_csv(filenew, sep=';', header=True, index=False)
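A quick way to check the widths is on an in-memory sample; here the first column is exactly 15 characters wide (the header padded to match), and read_fwf strips the surrounding whitespace:

```python
import io

import pandas as pd

# In-memory sample: the filename column is exactly 15 characters wide.
sample = (
    "Image          Label\n"
    "doc_pres223.jpg Durasal\n"
    "doc_pres224.jpg Tab Cefepime\n"
)
df = pd.read_fwf(io.StringIO(sample), widths=[15, 100])
```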
I'm trying to parse this text file using Pandas data frame.
The text file is in this particular format:
Name: Tom
Gender: Male
Books:
The problem of Pain
The reason for God: belief in an age of skepticism
My code so far to import the data is:
import pandas as pd
df = pd.read_table(filename, sep=":|\n", engine='python', index_col=0)
print(df)
The output I got is:
Name Tom
Gender Male
Books NaN
The problem of Pain NaN
The reason for God belief in an age of skepticism
How should I change the code such that the output I get will be: (edited output)
Name Gender Books
Tom Male The problem of Pain, The reason for God: belief in an age of skepticism
Thanks for helping!
You can do two things. First, you can use enumerate() together with an if statement; I used a text file named test.txt in the code below.
import pandas as pd

d = {}
value_list = []
for index, text in enumerate(open('test.txt', "r")):
    if index < 2:
        d[text.split(':')[0]] = text.split(':')[1].rstrip('\n')
    elif index == 2:
        value = text.split(':')[0]
    else:
        value_list.append(text.rstrip('\n'))
d[value] = [value_list]
df = pd.DataFrame(d)
Alternatively, you can use readlines() and then slice each line to populate the dictionary and create a dataframe.
import pandas as pd

text_file = open('test.txt', "r")
lines = text_file.readlines()
d = {}
d[lines[0:1][0].split(':')[0]] = lines[0:1][0].split(':')[1].rstrip('\n')
d[lines[1:2][0].split(':')[0]] = lines[1:2][0].split(':')[1].rstrip('\n')
d[lines[2:3][0].split(':')[0]] = [lines[3:]]
df = pd.DataFrame(d)
The method I use is simple: regex.
import os, re
import pandas as pd

# List all the files in the dir that end with .txt
files = [file for file in os.listdir(PROFILES) if file.endswith(".txt")]

HEADERS = ['Name', 'Gender', 'Books']
DATA = []  # create the empty list to store profiles

for file in files:  # iterate over each file
    filename = PROFILES + file  # full path name of the data file
    text_file = open(filename, "r")  # open the file
    lines = text_file.read()  # read the file into memory
    text_file.close()  # close the file

    ###############################################################
    # Regex to filter out all the column headers and row data. ####
    # Odd group number == header, even group number == data    ####
    ###############################################################
    books = re.search(r"(Name):(.*)\n+(Gender):(.*)\n+(Books):((?<=Books:)\D+)", lines)

    # append data into DATA list
    DATA.append([books.group(i).strip() for i in range(len(books.groups()) + 1) if not i % 2 and i != 0])

profilesDF = pd.DataFrame(DATA, columns=HEADERS)  # create the dataframe
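As a sanity check of that pattern on a single profile held in memory (the even-numbered groups carry the data, and the \D+ after Books: swallows the book lines because they contain no digits):

```python
import re

profile = (
    "Name: Tom\n"
    "Gender: Male\n"
    "Books:\n"
    "The problem of Pain\n"
    "The reason for God: belief in an age of skepticism"
)
m = re.search(r"(Name):(.*)\n+(Gender):(.*)\n+(Books):((?<=Books:)\D+)", profile)
# keep only the even-numbered (data) groups
fields = [m.group(i).strip() for i in range(len(m.groups()) + 1) if not i % 2 and i != 0]
```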
I’m new to coding, and trying to extract a subset of data from a large file.
File_1 contains the data in two columns: ID and Values.
File_2 contains a large list of IDs, some of which may be present in File_1 while others will not be present.
If an ID from File_2 is present in File_1, I would like to extract those values and write the ID and value to a new file, but I’m not sure how to do this. Here is an example of the files:
File_1: data.csv
ID Values
HOT224_1_0025m_c100047_1 16
HOT224_1_0025m_c10004_1 3
HOT224_1_0025m_c100061_1 1
HOT224_1_0025m_c10010_2 1
HOT224_1_0025m_c10020_1 1
File_2: ID.xlsx
IDs
HOT224_1_0025m_c100047_1
HOT224_1_0025m_c100061_1
HOT225_1_0025m_c100547_1
HOT225_1_0025m_c100561_1
I tried the following:
import pandas as pd
data_file = pd.read_csv('data.csv', index_col = 0)
ID_file = pd.read_excel('ID.xlsx')
values_from_ID = data_file.loc[['ID_file']]
The following error occurs:
KeyError: "None of [['ID_file']] are in the [index]"
Not sure if I am reading in the excel file correctly.
I also do not know how to write the extracted data to a new file once I get the code to do it.
Thanks for your help.
With pandas:
import pandas as pd
data_file = pd.read_csv('data.csv', index_col=0, delim_whitespace=True)
ID_file = pd.read_excel('ID.xlsx', index_col=0)
res = data_file.loc[ID_file.index].dropna()
res.to_csv('result.csv')
Content of result.csv:
IDs,Values
HOT224_1_0025m_c100047_1,16.0
HOT224_1_0025m_c100061_1,1.0
In steps:
You need to read your csv with whitespace delimited:
data_file = pd.read_csv('data.csv', index_col=0, delim_whitespace=True)
it looks like this:
>>> data_file
Values
ID
HOT224_1_0025m_c100047_1 16
HOT224_1_0025m_c10004_1 3
HOT224_1_0025m_c100061_1 1
HOT224_1_0025m_c10010_2 1
HOT224_1_0025m_c10020_1 1
Now, read your Excel file, using the ids as index:
ID_file = pd.read_excel('ID.xlsx', index_col=0)
and you use its index with loc to get the matching entries from your first dataframe. Drop the missing values with dropna():
res = data_file.loc[ID_file.index].dropna()
Finally, write to the result csv:
res.to_csv('result.csv')
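An equivalent filter that doesn't rely on the index can be sketched with isin, shown here on a hypothetical in-memory miniature of the two files:

```python
import io

import pandas as pd

# Miniature stand-in for data.csv (whitespace-separated)
data_file = pd.read_csv(
    io.StringIO("ID Values\nHOT224_1_0025m_c100047_1 16\nHOT224_1_0025m_c10004_1 3\n"),
    sep=r"\s+",
)
# Miniature stand-in for the ID list from ID.xlsx
wanted = ["HOT224_1_0025m_c100047_1", "HOT225_1_0025m_c100547_1"]

# Keep only rows whose ID appears in the wanted list
res = data_file[data_file["ID"].isin(wanted)]
```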
You can do it using a simple dictionary in Python. Make a dictionary from file 1, read the IDs from file 2, check those IDs against the dictionary, and write only the matching ones to your output file. Something like this could work:
with open('data.csv', 'r') as f:
    lines = f.readlines()

# Skip the CSV header
lines = lines[1:]
table = {l.split()[0]: l.split()[1] for l in lines if len(l.strip()) != 0}

with open('id.csv', 'r') as f:
    lines = f.readlines()

# Skip the CSV header
lines = lines[1:]
matchedIDs = [(l.strip(), table[l.strip()]) for l in lines if l.strip() in table]
Now you will have your matched IDs and their values in a list of tuples called matchedIDs. You can write them in any format you like in a file.
I'm also new to Python programming, so the code below might not be the most efficient. The situation I assumed is that we want the IDs present in both data.csv and id.csv; there might be some IDs in data.csv not in id.csv and vice versa.
import pandas as pd

data = pd.read_csv('data.csv')
id2 = pd.read_csv('id.csv')
data.ID = data['ID']
id2.ID = id2['IDs']
d = []
for row in data.ID:
    d.append(row)
f = []
for row in id2.ID:
    f.append(row)
g = []
for i in d:
    if i in f:
        g.append(i)
data = pd.read_csv('data.csv', index_col='ID')
new_data = data.loc[g, :]
new_data.to_csv('new_data.csv')
This is the code I ended up using. It worked perfectly. Thanks to everyone for their responses.
import pandas as pd
data_file = pd.read_csv('data.csv', index_col=0)
ID_file = pd.read_excel('ID.xlsx', index_col=0)
res = data_file.loc[ID_file.index].dropna()
res.to_csv('result.csv')