How do I create a pandas DataFrame from a text file? - python

I have a text file that needs to be read line by line and converted into a data frame with the 4 following columns
import re
import pandas as pd
# NOTE(review): the indentation of this snippet was lost when it was pasted;
# the statements below belong inside the `with` / `for` blocks.
with open('/Users/Desktop/Final Semester Fall 2022/archive/combined_data_1.txt',encoding='latin-1') as f:
for line in f:
# BUG: "/gm" is not Python regex syntax (it is a JavaScript flag suffix), so
# this pattern only matches lines containing a literal "/gm" and is
# effectively always None here.
result = re.search(r"^(\d+),(\d+),(\d{4}-\d{2}-\d{2})/gm", line)
if re.search(r"(^\d+):", line) is not None:
movie_id = re.search(r"(^\d+):", line).group(1)
elif result:
# These are plain string variables, overwritten on every match; only the
# values from the last matching line would survive the loop.
customerid = result.group(1)
rating = result.group(2)
date = result.group(3)
else:
continue
# Because result is always None (the /gm bug above), customerid/rating/date
# are never assigned, so this line raises a NameError.
data_list = [customerid, rating, date, movie_id]
df1 = pd.DataFrame(data_list)
df1.to_csv(r'/Users/Desktop/Final Semester Fall 2022/archive/combineddata1.csv')
I'm getting the following error:
How do I fix this error?
Thanks in advance!

here is one way to do it
# Read the raw file with read_csv, using ":" as the separator.
# Since there is only one colon ":" per movie-id line, you end up with a row
# for the movie followed by rows for the rest of the data.
df=pd.read_csv(r'c:\csv.csv', sep=':', header=None, names=['col1', 'col2'])
# When a row contains no comma it is only a movie id, so keep the movie-id
# rows, mask (blank out) the data rows, and forward-fill the id over them.
df['MovieId'] = df['col1'].mask(df['col1'].str.contains(',')).ffill()
# Split each data row into CustomerID, Rating and Date.
df[['CustomerID','Rating','Date']] = df['col1'].str.split(',',expand=True)
# Drop the movie-id rows (there col1 equals MovieId) and the helper columns.
df2=df[df['col1'].ne(df['MovieId'])].drop(columns=['col1','col2'])
df2
# sample created from the data you shared above as image
MovieId CustomerID Rating Date
1 1 1488844 3 2005-09-06
2 1 822109 5 2005-05-13
3 1 885013 4 2005-10-19
4 1 30878 4 2005-12-26
5 1 823519 3 2004-05-03
6 1 893988 3 2005-11-17
7 1 124105 4 2004-08-05
8 1 1248629 3 2004-04-22
9 1 1842128 4 2004-05-09
10 1 2238063 3 2005-05-11
11 1 1503895 4 2005-05-19
13 2 1288844 3 2005-09-06
14 2 832109 5 2005-05-13

You can parse that structure quite easily (without regex, using a few lines of very readable vanilla Python) and build a dictionary while reading the data file. You can then convert the dictionary to a DataFrame in one go.
import pandas as pd

# One list per output column; pandas builds a DataFrame from this
# dict-of-lists in one go at the end.
df = {'MovieID':[], 'CustomerID':[], 'Rating':[], 'Date':[]}

with open('data.txt', 'r') as f:
    for line in f:
        line = line.strip()
        if line:  # skip empty lines
            if line.endswith(':'):  # "<id>:" line starts a new movie block
                movie_id = line[:-1]
            else:  # data row "customer,rating,date" under the current movie
                customer_id, rating, date = line.split(',')
                df['MovieID'].append(movie_id)
                df['CustomerID'].append(customer_id)
                df['Rating'].append(rating)
                df['Date'].append(date)

# Replace the accumulator dict with the finished DataFrame.
df = pd.DataFrame(df)
print(df)
MovieID CustomerID Rating Date
0 1 1488844 3 2005-09-06
1 1 822109 5 2005-05-13
2 1 885013 4 2005-10-19
3 1 30878 4 2005-12-26
4 2 823519 3 2004-05-03
5 2 893988 3 2005-11-17
6 2 124105 4 2004-08-05
7 2 1248629 3 2004-04-22
8 2 1842128 4 2004-05-09
9 3 2238063 3 2005-05-11
10 3 1503895 4 2005-05-19
11 3 1288844 3 2005-09-06
12 3 832109 5 2005-05-13
It hardly gets easier than this.

An error in a regular expression
You've got the NameError because of /gm in the regular expression you use to identify result.
I suppose that /gm was copied here by mistake. In other languages these could be the GLOBAL and MULTILINE match modifiers, which, by the way, are not needed in this case. But in the Python re module they are just three literal characters. As long as no line contains /gm, your result was always None, so the elif result: ... block was never executed and the variables customerid, rating, date were never initialized.
An error in working with variables
If you remove /gm from the first matching, you'll have another problem: the variables customerid, rating, date, movie_id are just strings, so the resulting data frame will reflect only the last record of the source file.
To avoid this we have to work with them as with a list-like structure. For example, in the code below, they are keys in the data dictionary, each referring to a separate list:
file_name = ...  # path to the source data file

# One list per output column; appending keeps every record, unlike plain
# string variables which would retain only the last match.
data = {'movie_id': [], 'customerid': [], 'rating': [], 'date': []}

with open(file_name, encoding='latin-1') as f:
    for line in f:
        # "customerid,rating,YYYY-MM-DD" data row (no /gm suffix: that is
        # not valid Python regex syntax)
        result = re.search(r"^(\d+),(\d+),(\d{4}-\d{2}-\d{2})", line)
        if re.search(r"(^\d+):", line) is not None:
            # "<id>:" header line: remember the current movie id
            movie_id = re.search(r"(^\d+):", line).group(1)
        elif result:
            data['movie_id'].append(movie_id)
            data['customerid'].append(result.group(1))
            data['rating'].append(result.group(2))
            data['date'].append(result.group(3))
        else:
            continue

df = pd.DataFrame(data)
Code with test data
import re
import pandas as pd

# Test data in the Netflix-Prize layout: "<movie_id>:" header lines followed
# by "customerid,rating,date" rows.
data = '''\
1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26
2:
823519,3,2004-05-03
893988,3,2005-11-17
124105,4,2004-08-05
1248629,3,2004-04-22
1842128,4,2004-05-09
3:
2238063,3,2005-05-11
1503895,4,2005-05-19
1288844,3,2005-09-06
832109,5,2005-05-13
'''
file_name = "data.txt"
# Write the sample data out so the parsing code below can read it back.
with open(file_name, 'tw', encoding='latin-1') as f:
    f.write(data)

# One list per output column (see the explanation above the snippet).
data = {'movie_id': [], 'customerid': [], 'rating': [], 'date': []}
with open(file_name, encoding='latin-1') as f:
    for line in f:
        # "customerid,rating,YYYY-MM-DD" data row
        result = re.search(r"^(\d+),(\d+),(\d{4}-\d{2}-\d{2})", line)
        if re.search(r"(^\d+):", line) is not None:
            # "<id>:" header line: remember the current movie id
            movie_id = re.search(r"(^\d+):", line).group(1)
        elif result:
            data['movie_id'].append(movie_id)
            data['customerid'].append(result.group(1))
            data['rating'].append(result.group(2))
            data['date'].append(result.group(3))
        else:
            continue

df = pd.DataFrame(data)
# "data.txt" -> "data.csv"
df.to_csv(file_name[:-3] + 'csv', index=False)
An alternative
# Alternative: let read_csv split on commas; "N:" header lines have no
# commas, so they land entirely in the first column with NaN elsewhere.
df = pd.read_csv(file_name, names = ['customerid', 'rating', 'date'])
# Placeholder column for the movie id.
df.insert(0, 'movie_id', pd.NA)
# Rows NOT ending in ':' are data rows.
isnot_movie_id = ~df['customerid'].str.endswith(':')
# Keep only the "N:" header values, forward-fill them over the data rows,
# then strip the trailing ':'.
df['movie_id'] = df['customerid'].mask(isnot_movie_id).ffill().str[:-1]
# Header rows have NaN rating/date, so dropna removes them.
df = df.dropna().reset_index(drop=True)

Related

How to Convert a text data into DataFrame

How i can convert the below text data into a pandas DataFrame:
(-9.83334315,-5.92063135,-7.83228037,5.55314146), (-5.53137301,-8.31010785,-3.28062536,-6.86067081),
(-11.49239039,-1.68053601,-4.14773043,-3.54143976), (-22.25802006,-10.12843806,-2.9688831,-2.70574665), (-20.3418791,-9.4157625,-3.348587,-7.65474665)
I want to convert this to Data frame with 4 rows and 5 columns. For example, the first row contains the first element of each parenthesis.
Thanks for your contribution.
Try this:
import pandas as pd

with open("file.txt") as f:
    file = f.read()  # NOTE: shadows the builtin name `file` on Python 2

# Each whitespace-separated token is one "(a,b,c,d)" group; strip the
# parentheses and map position -> generated column name "name<i>".
df = pd.DataFrame([{f"name{id}": val.replace("(", "").replace(")", "") for id, val in enumerate(row.split(",")) if val} for row in file.split()])
import re
import pandas as pd

with open('file.txt') as f:
    # Pull every numeric token (minus sign and decimal point allowed) from
    # each input line; one list of strings per line.
    data = [re.findall(r'([\-\d.]+)',data) for data in f.readlines()]

# Transpose so each parenthesised group becomes a column, then convert the
# string tokens to floats.
df = pd.DataFrame(data).T.astype(float)
Output:
0 1 2 3 4
0 -9.833343 -5.531373 -11.492390 -22.258020 -20.341879
1 -5.920631 -8.310108 -1.680536 -10.128438 -9.415762
2 -7.832280 -3.280625 -4.147730 -2.968883 -3.348587
3 5.553141 -6.860671 -3.541440 -2.705747 -7.654747
Your data is basically in tuple-of-tuples form, hence you can simply pass a list of tuples instead of a tuple of tuples and get a DataFrame out of it.
Your Sample Data:
text_data = ((-9.83334315,-5.92063135,-7.83228037,5.55314146),(-5.53137301,-8.31010785,-3.28062536,-6.86067081),(-11.49239039,-1.68053601,-4.14773043,-3.54143976),(-22.25802006,-10.12843806,-2.9688831,-2.70574665),(-20.3418791,-9.4157625,-3.348587,-7.65474665))
Result:
As you can see, pandas by default displays up to 6 decimal places while your values have 8, hence you can use pd.options.display.float_format and set it accordingly.
pd.options.display.float_format = '{:,.8f}'.format
To get your desired data, you simply use transpose altogether to get the desired result.
pd.DataFrame(list(text_data)).T
0 1 2 3 4
0 -9.83334315 -5.53137301 -11.49239039 -22.25802006 -20.34187910
1 -5.92063135 -8.31010785 -1.68053601 -10.12843806 -9.41576250
2 -7.83228037 -3.28062536 -4.14773043 -2.96888310 -3.34858700
3 5.55314146 -6.86067081 -3.54143976 -2.70574665 -7.65474665
OR
Simply, you can use as below as well, where you can create a DataFrame from a list of simple tuples.
data = (-9.83334315,-5.92063135,-7.83228037,5.55314146),(-5.53137301,-8.31010785,-3.28062536,-6.86067081),(-11.49239039,-1.68053601,-4.14773043,-3.54143976),(-22.25802006,-10.12843806,-2.9688831,-2.70574665),(-20.3418791,-9.4157625,-3.348587,-7.65474665)
# data = [(-9.83334315,-5.92063135,-7.83228037,5.55314146),(-5.53137301,-8.31010785,-3.28062536,-6.86067081),(-11.49239039,-1.68053601,-4.14773043,-3.54143976),(-22.25802006,-10.12843806,-2.9688831,-2.70574665),(-20.3418791,-9.4157625,-3.348587,-7.65474665)]
pd.DataFrame(data).T
0 1 2 3 4
0 -9.83334315 -5.53137301 -11.49239039 -22.25802006 -20.34187910
1 -5.92063135 -8.31010785 -1.68053601 -10.12843806 -9.41576250
2 -7.83228037 -3.28062536 -4.14773043 -2.96888310 -3.34858700
3 5.55314146 -6.86067081 -3.54143976 -2.70574665 -7.65474665
wrap the tuples as a list
# Each tuple is one row of four measurements.
data = [
    (-9.83334315, -5.92063135, -7.83228037, 5.55314146),
    (-5.53137301, -8.31010785, -3.28062536, -6.86067081),
    (-11.49239039, -1.68053601, -4.14773043, -3.54143976),
    (-22.25802006, -10.12843806, -2.9688831, -2.70574665),
    (-20.3418791, -9.4157625, -3.348587, -7.65474665),
]

# Label the four columns A-D; the row index defaults to 0..4.
df = pd.DataFrame(data, columns=list("ABCD"))
print(df)
output:
A B C D
0 -9.833343 -5.920631 -7.832280 5.553141
1 -5.531373 -8.310108 -3.280625 -6.860671
2 -11.492390 -1.680536 -4.147730 -3.541440
3 -22.258020 -10.128438 -2.968883 -2.705747
4 -20.341879 -9.415762 -3.348587 -7.654747

How to split a DataFrame Column into Four parts using two different delimiters

I am working on a project that will take our old database system queries and map them to the new database system queries. I want to be able to read the FROM statements, split them into Database, Schema, Table, and Alias. The data looks like this:
FROM DatabaseA.SchemaA.Table1 tbl
INNER JOIN DatabaseB.SchemaC.Table13 tbl13
ON tbl.column12 = tbl.column12
I have created a dataframe from the full SQL query and extracted just the FROM statement into a new dataframe (I'm using two different functions):
# Reads an SQL file and creates a dataframe with one row per input line,
# tagged with the most recent statement keyword (SELECT/FROM/WHERE).
def readSQL(file_path):
    """Read the SQL file at *file_path* (UTF-8 encoded bytes) and return a
    DataFrame with columns StatementDSC and ColumnTable.

    Also publishes the result as the global ``sql_script`` so the other
    functions in this script can reach it.
    """
    # convert the bytes path to a string
    file_path = file_path.decode("utf-8")
    global sql_script
    # 'with' guarantees the handle is closed even on error (the original
    # open()/close() pair leaked it when readlines() raised).
    with open(file_path, 'r') as f:
        lines = f.readlines()
    sql_script = pd.DataFrame(columns=('StatementDSC', 'ColumnTable'))
    i = 0
    StatementDSC = ""
    ColumnTable = ""
    for line in lines:
        if ('SELECT' in line) or ('FROM' in line) or ('WHERE' in line):
            StatementDSC = line.strip()  # remove \n
        else:
            ColumnTable = line.strip()  # remove \t and \n
        # Append one row per line; StatementDSC "sticks" until the next
        # keyword line, so every row is tagged with its statement.
        sql_script.loc[i] = [StatementDSC, ColumnTable]
        i = i + 1
    return sql_script
# Reads the FROM statements and breaks each table reference down into
# Database / Schema / TableAlias columns.
def readFROM(full_SQL):
    """Print the FROM rows of *full_SQL* and their split-out parts.

    *full_SQL* must have StatementDSC and ColumnTable columns (as produced
    by readSQL). Returns None; results are printed.
    """
    # Keep only the FROM rows
    df = full_SQL.loc[full_SQL['StatementDSC'] == 'FROM']
    # Drop empty rows
    # df.drop(df.ColumnTable == "",axis=0)
    print(df)
    # Split ColumnTable on periods into Database, Schema and the combined
    # "Table Alias" remainder.
    split_data = df["ColumnTable"].str.split('.')
    data = split_data.to_list()
    names = ["Database", "Schema", "TableAlias"]
    new_df = pd.DataFrame(data, columns=names)
    print(new_df)
I can get the data to split from this:
StatementDSC ColumnTable
5 FROM
6 FROM PPDAC.PLAN.DOCTORS Docs
7 FROM INNER JOIN PPDAC.PLAN.DOCTORSINFO DocInfo
8 FROM ON Docs.DocID = DocInfo.DocID
To this:
Database Schema TableAlias
0 None None
1 PPDAC PLAN DOCTORS Docs
2 INNER JOIN PPDAC PLAN DOCTORSINFO DocInfo
How do I add another delimiter to separate these into four by using both a period and a space?
I've seen other answers, but they've been for one line and not repeating on multiple lines.
Use regex to split based on multiple delimiters, such as any delimiter in the [ brackets ], in this case . or space. See this toy example:
# Toy frame: one single-column row per dotted-and-spaced string.
source_rows = [['1.2.3 4'], ['1.4.5 6'], ['2.7.8 9']]
df = pd.DataFrame(source_rows, columns=['ColumnTable'])

# '[ .]' is a regex character class: split on either a space or a literal
# dot in a single pass.
df2 = df['ColumnTable'].str.split('[ .]', expand=True)
print(df2)
#    0  1  2  3
# 0  1  2  3  4
# 1  1  4  5  6
# 2  2  7  8  9

Panda module export, split data

I'm trying to read a .txt file and output the count of each letter which works, however, I'm having trouble exporting that data to .csv in a specific way.
A snippet of the code:
# Count the frequency of every character in the book file.
freqs = {}
# NOTE(review): Book1 must be defined elsewhere as the file path.
with open(Book1) as f:
    for line in f:
        for char in line:
            if char in freqs:
                freqs[char] += 1
            else:
                freqs[char] = 1
print(freqs)
And for the exporting to csv, I did the following:
# NOTE(review): [freqs] wraps the whole dict in a one-element list, so the
# entire dict becomes a single cell of the frame rather than one row per
# letter -- which is exactly the layout problem described above.
test = {'Book 1 Output':[freqs]}
df = pd.DataFrame(test, columns=['Book 1 Output'])
df.to_csv(r'book_export.csv', sep=',')
Currently when I run it, the export looks like this (Manually done):
However I want the output to be each individual row, so it should look something like this when I open it:
I want it to separate it from the ":" and "," into 3 different columns.
I've tried various other answers on here but most of them end up with giving ValueErrors so maybe I just don't know how to apply it like the following one.
df[[',']] = df[','].str.split(expand=True)
Use DataFrame.from_dict with DataFrame.rename_axis for set index name, then csv looks like you need:
# Sample frequency data keyed by letter.
freqs = {'a': 5, 'b': 2}

# Build one row per dict key (orient='index'), then name the index so it
# is written out as a proper "Letter" column.
df = pd.DataFrame.from_dict(
    freqs, orient='index', columns=['Book 1 Output']
).rename_axis('Letter')
print(df)
Book 1 Output
Letter
a 5
b 2
df.to_csv(r'book_export.csv', sep=',')
Or alternative is use Series:
s = pd.Series(freqs, name='Book 1 Output').rename_axis('Letter')
print (s)
Letter
a 5
b 2
Name: Book 1 Output, dtype: int64
s.to_csv(r'book_export.csv', sep=',')
EDIT:
If there are multiple frequencies change DataFrame constructor:
freqs = {'a':5,'b':2}
freqs1 = {'a':9,'b':3}
# A dict of dicts: the outer keys become the columns (f1, f2), the inner
# keys the shared index, which is named 'Letter'.
df = pd.DataFrame({'f1':freqs, 'f2':freqs1}).rename_axis('Letter')
print (df)
f1 f2
Letter
a 5 9
b 2 3

Merge text file with a csv database file using pandas

[Update my question]
I have a text file looks like below,
#File_infoomation1
#File_information2
A B C D
1 2 3 4.2
5 6 7 8.5 #example.txt separate by tab '\t' column A dtype is object
I'd like to merge the text file with a csv database file based on column E. The column contains integer.
E,name,age
1,john,23
5,mary,24 # database.csv column E type is int64
So I tried to read the text file then remove first 2 unneeded head lines.
# header=2 makes the third physical line ("A B C D") the header row,
# skipping the two "#File_information" comment lines.
example = pd.read_csv('example.txt', header = 2, sep = '\t')
database = pd.read_csv('database.csv')
# Align the key column names so the join below can use 'E'.
request = example.rename(columns={'A': 'E'})
# NOTE(review): if example['E'] parsed as object (string) while
# database['E'] is int64, no keys match and name/age come back NaN --
# confirm both dtypes before merging.
New_data = request.merge(database, on='E', how='left')
But the result does not appear the stuff I want, while it shows NaN in column name and age,
I think the int64 vs. object dtype mismatch is where the mistake is; does anyone know how to work this out?
E,B,C,D,name,age
1,2,3,4.2,NaN,NaN
5,6,7,8.5,NaN,NaN
You just need to edit this in your code:
instead of
example = pd.read_csv('example.txt', header = 2, sep = '\t', delim_whitespace=False )
Use this:
example = pd.read_csv('example.txt', sep = ' ' ,index_col= False)
Actually I tried reading your files with:
# header=2 skips the two comment lines and uses "A B C D" as the header.
example = pd.read_csv('example.txt', header = 2, sep = '\t')
# Renaming: overwrite all column labels so the key column is 'E'.
example.columns = ['E','B','C','D']
database = pd.read_csv('database.csv')
# Left-join on the shared key column 'E'.
New_data = example.merge(database, on='E', how='left')
And this returns:
E B C D name age
0 1 2 3 4.2 john 23
1 5 6 7 8.5 mary 24
EDIT: actually is not clear the separator of the original example.txt file. If it is space try putting sep='\s' instead sep=' ' for space.

Split the input file-format into a multiple lines list, interpolating number ranges "n-m"

I would need help to separate the csv into a list.
Here is the input file and out put file that I need.
I have a CSV file which look like this (line by line):
1-6
97
153,315,341,535
15,~1510,~1533,~1534,~1535,~1590
I need my output to be:
Col 1 Col 2
1 ~1510
2 ~1533
3 ~1534
4 ~1535
5 ~1590
6
97
153
315
341
535
15
Meaning: when I detect a "-" sign (for example, 1-6), it should expand to the numbers 1 through 6,
and the numbers with and without a "~" should be separated into 2 different columns.
However results i get with my code is as below:
Col1 Col2 Col3 Col4 Col5 Col6
6-Jan
97
153 315 341 535
15 ~1510 ~1533 ~1534 ~1535 ~1590
my code:
import csv

with open('testing.csv') as f, open("testing1.csv", "w") as outfile:
    writer = csv.writer(outfile)
    f.readline()  # skip the heading line
    csv_reader = csv.reader(f, delimiter=",")
    for line_list in csv_reader:
        # NOTE(review): csv_reader already split the line on commas, so
        # line_list[0] is only the FIRST field and contains no commas --
        # the remaining fields are silently dropped here, which matches the
        # wrong output described above.
        skills_list = [line_list[0].split(',')]
        for skill in skills_list:
            writer.writerow(skill)
Please help. Thanks A lot.
This is how I would do it. read all the data first and construct your columns. Then iterate over the columns and build your csv.
Here is code for building the columns.
import csv

# Build the two output columns from the ragged CSV input:
# column_1 collects the plain numbers (with "n-m" ranges expanded),
# column_2 collects the "~"-prefixed entries.
column_1 = []
column_2 = []

# 'with' closes the file even if parsing raises (the original
# open()/close() pair leaked the handle on errors).
with open('testing.csv', 'r') as fin:
    for line in fin:
        items = line.split(',')
        for item in items:
            if '-' in item:
                # "n-m" means the inclusive range n..m; start at n itself
                # (the original "+1" on the start skipped the first number
                # of every range, so "1-6" produced only 2..6).
                num_range = item.split('-')
                column_1 += range(int(num_range[0]), int(num_range[1]) + 1)
            elif '~' in item:
                column_2.append(item.strip())
            else:
                column_1.append(item.strip())
You cannot write output until you have read the required input. So the first output line can be written when you have obtained the input ~1510.
The simplest solution is to read the entire input file into memory, then write. I would maintain two lists, then push to the first if no tilde, otherwise to the other one. For output, then, simply loop over these lists, supplying empty values if one of them runs out.
If you need to optimize memory usage (e.g. if there is more input than you can fit into memory), maybe write each line as soon as you can and free up its memory; but this is more challenging to get right.
import itertools as it

# col1 holds plain numbers; col2 holds numbers that arrived with a '~'
# prefix (stored without it, re-prefixed when printing).
results = {
    'col1': [],
    'col2': [],
}

with open('data.txt') as f:
    for line in f:
        line = line.rstrip()
        entries = line.split(",")
        for entry in entries:
            if entry.startswith('~'):
                column = 'col2'
                entry = entry[1:]  # strip the marker, keep the number
            else:
                column = 'col1'
            if '-' in entry:
                # "n-m" expands to the inclusive range n..m
                start, end = entry.split('-')
                results[column].extend(
                    list(range(int(start), int(end)+1))
                )
            else:
                results[column].append(entry)

print("{} \t {}".format('Col 1', 'Col 2'))
# zip_longest pads the shorter column with '' so both print side by side.
column_iter = it.zip_longest(
    results['col1'],
    ["~{}".format(num) for num in results['col2']],
    fillvalue=''
)
for col1_num, col2_num in column_iter:
    print(
        "{} \t {}".format(col1_num, col2_num)
    )
--output:--
Col 1 Col 2
1 ~1510
2 ~1533
3 ~1534
4 ~1535
5 ~1590
6
97
153
315
341
535
15
And with this data.txt:
1-6
~7-10,97
153,315,341,535
15,~1510,~1533,~1534,~1535,~1590
output:
Col 1 Col 2
1 ~7
2 ~8
3 ~9
4 ~10
5 ~1510
6 ~1533
97 ~1534
153 ~1535
315 ~1590
341
535
15

Categories

Resources