I have a txt file with info inside it, where each field of a deal is on its own line (separated by \n):
DEAL: 896
CITY: New York
MARKET: Manhattan
PRICE: $9,750,000
ASSET TYPE: Rental Building
SF: 8,004
PPSF: $1,218
DATE: 11/01/2017
Is there any way to make a CSV (or other) table with headers like CITY, MARKET, etc. using pandas or the csv module? All the info under a given label should go into the corresponding header's column.
Updated to work around colons in the data by treating only the first : on each line as the delimiter:
import pandas as pd
new_temp = open('temp.txt', 'w')  # write data to a new file, changing the first delimiter only
with open('fun.txt') as f:
    for line in f:
        line = line.replace(':', '|', 1)  # only replace the first ':' so it can serve as the delimiter for pandas
        new_temp.write(line)
new_temp.close()
df = pd.read_csv('temp.txt', delimiter='|', header=None)
df = df.set_index([0]).T
df.to_csv('./new_transposed_df.csv', index=False)
This will make a csv with the left column as headers and the right column as data, without changing colons in the second column. It writes out a temp file called temp.txt which you can remove after you run the program.
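If you'd rather skip the temporary file, the same first-colon replacement can be done in memory with io.StringIO. A minimal sketch, with the sample data abbreviated:

```python
import io
import pandas as pd

# Abbreviated sample of the deal records
raw = """DEAL: 896
CITY: New York
MARKET: Manhattan
PRICE: $9,750,000"""

# Replace only the first ':' on each line so it can act as the delimiter;
# colons inside values are left untouched
cleaned = "\n".join(line.replace(":", "|", 1) for line in raw.splitlines())

df = pd.read_csv(io.StringIO(cleaned), delimiter="|", header=None)
df = df.set_index(0).T  # the labels become the header row
print(df)
```

From here, df.to_csv('new_transposed_df.csv', index=False) writes the same output as the temp-file version.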
Use Pandas to input it and then transform/pivot your table.
import pandas as pd
df = pd.read_csv('data.txt',sep=':',header=None)
df = df.set_index(0).T
Example
import pandas as pd
data = '''
DEAL: 896
CITY: New York
MARKET: Manhattan
PRICE: $9,750,000
ASSET TYPE: Rental Building
SF: 8,004
PPSF: $1,218
DATE: 11/01/2017
'''
from io import StringIO  # pd.compat.StringIO was removed in pandas 1.0; use io.StringIO instead
df = pd.read_csv(StringIO(data), sep=':', header=None)
print(df.set_index(0).T)
Results:
I have several nc files and need to open them all, filter the desired period, and write the combined result to a single txt file.
Each nc file corresponds to a month (Jan, Feb, Mar, ...) and has four variables (temperature, dew point, u and v).
I need to assemble a table with all the variables side by side for a specific period, for example from January to October: the first column being temperature, the second dew point, the third u and the last v.
from netCDF4 import MFDataset
import pandas as pd
import xarray as xr
import csv
import tempfile
ds=xr.open_mfdataset('/home/milena/Documentos/dados_obs_haroldo/media_horaria/MEDIA_HORARIA_*.nc')
lat = ds.variables['lat'][:]
lon = ds.variables['lon'][:]
t2mj = ds.variables['t2mj'][:]
td2mj = ds.variables['td2mj'][:]
u10mj = ds.variables['u10mj'][:]
v10mj = ds.variables['v10mj'][:]
#Brasilia
t2mj_txt=ds.t2mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj.csv')
td2mj_txt=ds.td2mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj.csv')
u10mj_txt=ds.u10mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj.csv')
v10mj_txt=ds.v10mj.isel(lat=153, lon=117).to_dataframe().to_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj.csv')
#print(t2mj_txt)
# open csv
t2mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj.csv', skipinitialspace=True)
td2mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj.csv', skipinitialspace=True)
u10mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj.csv', skipinitialspace=True)
v10mj_csv = pd.read_csv('/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj.csv', skipinitialspace=True)
#print(t2mj_csv)
#filter desired period
t2mj_date=t2mj_csv[(t2mj_csv['time'])<"2022-12-01"]
td2mj_date=td2mj_csv[(td2mj_csv['time'])<"2022-12-01"]
u10mj_date=u10mj_csv[(u10mj_csv['time'])<"2022-12-01"]
v10mj_date=v10mj_csv[(v10mj_csv['time'])<"2022-12-01"]
arquivo = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj_filter.txt", "w")
arquivo.write(t2mj_date['t2mj'].to_string())
arquivo.close()
arquivo2 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj_filter.txt", "w")
arquivo2.write(td2mj_date['td2mj'].to_string())
arquivo2.close()
arquivo3 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj_filter.txt", "w")
arquivo3.write(u10mj_date['u10mj'].to_string())
arquivo3.close()
arquivo4 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj_filter.txt", "w")
arquivo4.write(v10mj_date['v10mj'].to_string())
arquivo4.close()
file_list=['/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/t2mj_filter.txt', '/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/td2mj_filter.txt', '/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/u10mj_filter.txt', '/home/milena/Documentos/dados_obs_haroldo/media_horaria/csv/v10mj_filter.txt']
dfe = pd.DataFrame()
for file in file_list:
    temp_dfe = pd.read_csv(file, header=None, names=[file[:-4]])
    dfe = pd.concat([dfe, temp_dfe], axis=1)
arquivo5 = open("/home/milena/Documentos/dados_obs_haroldo/media_horaria/teste.txt", "w")
arquivo5.write(dfe.to_string())
arquivo5.close()
My result looks like this: (screenshot omitted)
I would like it to look like this: (screenshot omitted)
The beauty of xarray is that you don't need all this extra IO to solve this problem. Just use xarray's selection logic, then convert the dataset to a dataframe before writing a multi-column text file.
ds = xr.open_mfdataset(...)
# select point, select time slice, drop lat/lon variables
ds_point = ds.isel(lat=153, lon=117).sel(time=slice(None, '2022-12-01')).drop(['lat', 'lon'])
# convert to dataframe
df_point = ds_point.to_dataframe()
# order columns then write to text file with tab separator
variables = ['t2mj', 'td2mj', 'u10mj', 'v10mj']
df_point[variables].to_csv(..., sep='\t')
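The final two steps (ordering the columns, then writing tab-separated text) can be sketched with a plain pandas frame; the values below are made-up stand-ins for what ds_point.to_dataframe() would return:

```python
import pandas as pd

# Hypothetical stand-in for ds_point.to_dataframe(): a time-indexed frame
# whose columns arrive in arbitrary order
df_point = pd.DataFrame(
    {
        "v10mj": [1.0, 2.0],
        "t2mj": [290.1, 291.4],
        "u10mj": [3.5, 3.0],
        "td2mj": [280.0, 281.2],
    },
    index=pd.to_datetime(["2022-01-01", "2022-01-02"]),
)

# Reindex the columns into the desired order, then write tab-separated text
variables = ["t2mj", "td2mj", "u10mj", "v10mj"]
out = df_point[variables].to_csv(sep="\t")
print(out)
```

Passing a path instead of capturing the string writes the same table straight to a txt file.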
I am trying to print out a csv file by comparing a column between two csv files:
CSV1:
network, geoname
1.0.0.0/24, 123456
2.0.0.0/24, 76890
.....
CSV2:
geoname, country_code, country
123456, XX, ABC
....
....
I want to compare the geoname column between csv 1 & 2 and map the network section according to the geoname and associated country and country code.
Final CSV:
network, geoname, country code, country
1.0.0.0/24,123456, XX, ABC
NB: csv1 contains duplicate geonames, while csv2 maps each geoname to a country.
I am trying to map the network section in csv1 using the geonames and get the associated country and code in my final csv.
The current problem I am facing is that the code only runs until the 2nd CSV file is exhausted, and hence I am not able to map things properly.
#script for making GeipCountrywhois csv for input to zmap
from asyncore import write
import csv
import os
import pandas as pd
import numpy as np
import sys
indir=os.environ['HOME']+'/code/surveys/mmdb/GeoLite2-Country-CSV_20220308/'
v4file=indir+'GeoLite2-Country-Blocks-IPv4.csv'
localefile = indir+'GeoLite2-Country-Locations-en.csv'
outfile = indir+'GeoIPCountryWhois.csv'
data = []
data2 = []
data3 = []
with open(v4file, "r") as file:
    reader = csv.reader(file)
    of = open(outfile, "w")
    writer = csv.writer(of)
    for row in reader:
        ip_cidr = row[0]
        geoname = row[1]
        data = [ip_cidr, geoname]
        #print(data)
        with open(localefile, "r") as file:
            reader2 = csv.reader(file)
            for row in reader2:
                geoname_en = row[0]
                cc = row[4]
                country = row[5]
                data2 = [geoname, cc, country]
                #print(data2)
                if(data[1] == data2[0]):
                    data3 = [ip_cidr, geoname, cc, country]
                    writer.writerow(data3)
                    print(data3)
I don't know whether you are using pandas or not, so here is a pandas solution.
First, install pandas in your Python environment if it isn't already installed. You can install it with the following command:
python -m pip install pandas
Then try this code block:
import pandas as pd
# load csv1, assume that its name is network.csv
network = pd.read_csv('network.csv')
# Then load csv2, assume that its name is geonames.csv
geo = pd.read_csv('geonames.csv')
# left-join csv2 onto csv1 on the geoname column (duplicate geonames in csv1 are fine)
final_csv = network.merge(geo, how='left', on='geoname')
# save result as csv
final_csv.to_csv('final.csv', index=False)
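As a quick illustration of what the left merge produces, using the sample rows from the question (networks whose geoname has no match in csv2 are kept, with NaN in the country fields):

```python
import pandas as pd

# Inline stand-ins for network.csv and geonames.csv
network = pd.DataFrame(
    {"network": ["1.0.0.0/24", "2.0.0.0/24"], "geoname": [123456, 76890]}
)
geo = pd.DataFrame(
    {"geoname": [123456], "country_code": ["XX"], "country": ["ABC"]}
)

# how='left' keeps every row of csv1, matched or not
final_csv = network.merge(geo, how='left', on='geoname')
print(final_csv)
```

If you'd rather drop unmatched networks entirely, use how='inner' instead.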
Hello everyone, I am learning Python and I am new. I have a column in a csv file whose values contain two parts joined by a semicolon.
I want to split the programme column on that semicolon into two columns, for example:
program 1: H2020-EU.3.1.
program 2: H2020-EU.3.1.7.
This is what I wrote initially
import csv
import os
with open('IMI.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    with open('new_IMI.csv', 'w') as new_file:
        csv_writer = csv.writer(new_file, delimiter='\t')
        #for line in csv_reader:
        #    csv_writer.writerow(line)
Please note that after I split the columns I need to write the data out again as a csv and save it to my computer.
Please guide me.
Using .loc to iterate through each row of a dataframe is somewhat inefficient. It is better to split the entire column at once, passing expand=True to assign the result to the new columns. Also, as stated, pandas makes this easy:
Code:
import pandas as pd
df = pd.read_csv('IMI.csv')
df[['programme1','programme2']] = df['programme'].str.split(';', expand=True)
df.drop(['programme'], axis=1, inplace=True)
df.to_csv('IMI.csv', index=False)
Example of output:
Before:
print(df)
id acronym status programme topics
0 945358 BIGPICTURE SIGNED H2020-EU.3.1.;H2020-EU3.1.7 IMI2-2019-18-01
1 821362 EBiSC2 SIGNED H2020-EU.3.1.;H2020-EU3.1.7 IMI2-2017-13-06
2 116026 HARMONY SIGNED H202-EU.3.1. IMI2-2015-06-04
After:
print(df)
id acronym status topics programme1 programme2
0 945358 BIGPICTURE SIGNED IMI2-2019-18-01 H2020-EU.3.1. H2020-EU3.1.7
1 821362 EBiSC2 SIGNED IMI2-2017-13-06 H2020-EU.3.1. H2020-EU3.1.7
2 116026 HARMONY SIGNED IMI2-2015-06-04 H2020-EU.3.1. None
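A minimal runnable version of the split, with the data inlined rather than read from IMI.csv (rows without a semicolon get None in the second column, as in the HARMONY row above):

```python
import pandas as pd

df = pd.DataFrame({
    "id": [945358, 116026],
    "programme": ["H2020-EU.3.1.;H2020-EU.3.1.7.", "H2020-EU.3.1."],
})

# expand=True returns a dataframe, so the result can be assigned
# to two new columns in one step
df[["programme1", "programme2"]] = df["programme"].str.split(";", expand=True)
df.drop(["programme"], axis=1, inplace=True)
print(df)
```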
You can use pandas library instead of csv.
import pandas as pd
df = pd.read_csv('IMI.csv')
p1 = {}
p2 = {}
for i in range(len(df)):
    if ';' in df['programme'].loc[i]:
        p1[df['id'].loc[i]] = df['programme'].loc[i].split(';')[0]
        p2[df['id'].loc[i]] = df['programme'].loc[i].split(';')[1]
df['programme1'] = df['id'].map(p1)
df['programme2'] = df['id'].map(p2)
and if you want to delete the programme column (note that drop returns a new dataframe, so reassign it):
df = df.drop('programme', axis=1)
To save a new csv file (to_csv has no inplace parameter):
df.to_csv('new_file.csv', index=False)
I have a csv file containing a data set (in this case addresses). I would like to make a second csv file containing only the entries which have one of a set of phrases in a specific column. For example I would like to return all the people who currently live in "Viridian" but not those who previously lived there or never lived there.
The example data is:
First Name,Second Name,ID,Home Town,County,Current Town,Street
Sam,Smith,1234,Pallet,North,Orange,Lemon
Jenny,Walton,1456,Viridian,West,York,High View
Alan,Kirk,2378,Orange,West,Viridian,High street
Reese,Small,9840,Minsk,East,Viridian,Ocean Avenue
Audry,Owen,7865,York,South,Blackmarsh,8th Street
Marco,Jefferson,1580,Amsterdam,Central,Oxford,Church Road
Jim,Lowe,5218,Windy City,East,Windy City,Oak
Gillian,Pope,3217,Rome,Central,Rome,Low road
I have previously used this code:
town = ["Viridian", "Rome"]
with open("addresses.csv",) as oldfile, open("Filtered addresses.csv", "w") as newfile:
    for line in oldfile:
        if any(town in line.strip().lower() for town in town):
            newfile.write(line)
However, this returns lines with the specified cities in all columns - I just want the ones with the specified cities in the column "current town".
I tried this instead:
import csv
town = ["Viridian", "Rome"]
with open("Filtered addresses.csv", "w", encoding="Latin-1") as newfile:
    reader = csv.reader(open("addresses.csv", 'r', encoding="Latin-1"))
    for data in reader:
        if any(town in data[6] for town in town):
            newfile.write(data)
But this results in an error:
TypeError: write() argument must be str, not list
While altering the code to read:
newfile.write(str(data))
returns some entries but they are formatted as a single long line rather than rows.
What is the best way to achieve my aim? I would like to keep the full row of data in each case.
Thanks!
pandas will make it extremely easy:
import pandas as pd
town = ["Viridian", "Rome"]
# Read csv as pandas dataframe
original = pd.read_csv("addresses.csv", index_col=False)
# Select rows where `Current Town` column's value is in `town`
filtered = original[original['Current Town'].isin(town)]
# Save the filtered dataframe to a file
filtered.to_csv("Filtered addresses.csv")
If you don't have pandas installed, you can easily install it running:
pip install pandas
in your command line
import csv
town = ["Viridian", "Rome"]
with open("Filtered addresses.csv", "w", encoding="Latin-1", newline="") as newfile:
    reader = csv.reader(open("addresses.csv", 'r', encoding="Latin-1"))
    csvwriter = csv.writer(newfile)
    for data in reader:
        if data[5] in town:  # 'Current Town' is the sixth column, index 5
            csvwriter.writerow(data)
import csv
f = csv.reader(open('lmt.csv','r')) # open input file for reading
Date, Open, Hihh, mLow, Close, Volume = zip(*f) # split it into separate columns
ofile = open("MYFILEnew1.csv", "wb") # output csv file
c = csv.writer(ofile)
item = Date
item2 = Volume
rows = zip(item, item)
i = 0
for row in item2:
    print row
    writer = csv.writer(ofile, delimiter='\t')
    writer.writerow([row])
ofile.close()
Above is what I have produced so far.
As you can see in the 3rd line, I have extracted 6 columns from a spreadsheet.
I want to create a .csv file under the name of MYFILEnew1.csv which only has two columns, Date and Volume.
What I have above creates a .csv that only writes Volume column into the first column of the new .csv file.
How would you go about placing Date into the second column?
For example
Date Open High Low Close Volume
17-Feb-16 210 212.97 209.1 212.74 1237731
is what i have. and Id like to produce a new csv file such that it has
Date Volume
17-Feb-16 1237731
If I understand your question correctly, you can achieve that very easily using pandas' read_csv and to_csv; the final solution to your problem can be found below under EDIT2:
import pandas as pd
# this assumes that your file is comma separated
# if it is e.g. tab separated you should use pd.read_csv('data.csv', sep = '\t')
df = pd.read_csv('data.csv')
# select desired columns
df = df[['Date', 'Volume']]
#write to the file (tab separated)
df.to_csv('MYFILEnew1.csv', sep='\t', index=False)
So, if your data.csv file looks like this:
Date,Open,Hihh,mLow,Close,Volume
1,5,9,13,17,21
2,6,10,14,18,22
3,7,11,15,19,23
4,8,12,16,20,24
Then MYFILEnew1.csv would look like this after running the script above:
Date Volume
1 21
2 22
3 23
4 24
EDIT
Using your data (tab separated, stored in the file data3.csv):
Date Open Hihh mLow Close Volume
17-Feb-16 210 212.97 209.1 212.74 1237731
Then
import pandas as pd
df = pd.read_csv('data3.csv', sep='\t')
# select desired columns
df = df[['Date', 'Volume']]
# write to the file (tab separated)
df.to_csv('MYFILEnew1.csv', sep='\t', index=False)
gives the desired output
Date Volume
17-Feb-16 1237731
EDIT2
Since your header in your input csv file seems to be messed up (as discussed in the comments), you have to rename the first column. The following now works fine for me using your entire dataset:
import pandas as pd
df = pd.read_csv('lmt.csv', sep=',')
# get rid of the wrongly formatted column name
df.rename(columns={df.columns[0]: 'Date' }, inplace=True)
# select desired columns
df = df[['Date', 'Volume']]
# write to the file (tab separated)
df.to_csv('MYFILEnew1.csv', sep='\t', index=False)
Here I would suggest using the csv module's csv.DictReader object to read and write from the files. To read the file, you would do something like
import csv
fieldnames=('Date', 'Open', 'High', 'mLow', 'Close', 'Volume')
with open('myfilename.csv') as f:
    reader = csv.DictReader(f, fieldnames=fieldnames)
Beyond this, you will just need to filter out the keys you don't want from each row and similarly use the csv.DictWriter class to write to your export file.
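That filter-and-write round trip could look like the following sketch, using in-memory buffers in place of the real files (letting DictReader take the fieldnames from the header row instead of passing them explicitly):

```python
import csv
import io

# Sample input, standing in for lmt.csv
raw = """Date,Open,Hihh,mLow,Close,Volume
17-Feb-16,210,212.97,209.1,212.74,1237731
"""

keep = ('Date', 'Volume')
out = io.StringIO()

reader = csv.DictReader(io.StringIO(raw))  # header row supplies the fieldnames
writer = csv.DictWriter(out, fieldnames=keep)
writer.writeheader()
for row in reader:
    writer.writerow({k: row[k] for k in keep})  # drop the unwanted keys

print(out.getvalue())
```

With real files, replace the StringIO objects with open(..., newline='') handles.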
You were so close:
import csv
f = csv.reader(open('lmt.csv','rb')) # csv is binary
Date, Open, Hihh, mLow, Close, Volume = zip(*f)
rows = zip(Date, Volume)
ofile = open("MYFILEnew1.csv", "wb")
writer = csv.writer(ofile)
for row in rows:
    writer.writerow(row) # row is already a tuple so no need to make it a list
ofile.close()