How to add entire columns to selected cells in a template in Python?

Problem
With help from Stack Overflow, I can now append data to an existing Excel sheet on Amazon Web Services' S3. My current problem is that I want to add whole columns from a dataframe to ranges of cells in the Excel sheet; at the moment I can only add values cell by cell. The part of the reprex below where I need help is marked "THIS IS WHERE I NEED HELP" :-)
Reprex
#load packages
from io import BytesIO
from tempfile import NamedTemporaryFile
import boto3
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
# Load Template from S3
bucket_name="main_folder"
object_key="sub_folder/template.xlsx"
bucket_object = boto3.resource('s3').Bucket(bucket_name).Object(object_key)
content = bucket_object.get()['Body'].read()
# Input Data
data_input = {
'Area': ['North', 'North', 'North', 'South', 'South', 'South', 'West', 'West', 'West', 'East', "East", "East"],
"Sub-Area": ["North2", "North1", "North2", "South2", "South1", "South2", "West3", "West9", "West9", "East1",
"East4", "East1"],
"Workers": [1, 20, 30, 2, 33, 5, 3, 6, 44, 1, 11, 111],
"Job1": ["T", "T", "T", "X", "T", "T", "T", "X", "T", "X", "T", "T"],
"Job2": ["F", "X", "T", "X", "T", "F", "T", "X", "F", "X", "T", "T"],
"Job3": ["T", "F", "T", "X", "X", "F", "F", "T", "X", "X", "T", "T"]}
# Create DataFrame
df = pd.DataFrame(data_input)
# Load Workbook
wb = load_workbook(filename=(BytesIO(content)))
ws = wb['Sheet1']
# THIS IS WHERE I NEED HELP
#Change contents
ws["A2"] = df1["Area"][0]
ws["A3"] = df1["Area"][1]
ws["A4"] = df1["Area"][2]
ws["A5"] = df1["Area"][3]
...
ws["A14"] = df1["Area"][11]
ws["D2"] = df1["Sub-Area"][0]
ws["D3"] = df1["Sub-Area"][1]
ws["D4"] = df1["Sub-Area"][2]
...
ws["D14"] = df1["Sub-Area"][11]
etc.
# Save Workbook back to S3
s3 = boto3.client('s3')
with NamedTemporaryFile() as tmp:
    filename = '/tmp/{}'.format("template.xlsx")
    wb.save(filename)
    s3.upload_file(Bucket=bucket_name, Filename=filename, Key=object_key)
Help
Is there an easier way of doing this?
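One way to avoid writing every cell by hand is to loop over a mapping from dataframe columns to worksheet columns. Below is a minimal sketch under the assumption that the data should land in fixed columns starting at row 2; the column letters in the mapping are illustrative and would need to match your template:
# Hypothetical mapping: dataframe column -> worksheet column letter
column_map = {"Area": "A", "Sub-Area": "D"}
start_row = 2  # first data row below the template's headers

for df_col, ws_col in column_map.items():
    for offset, value in enumerate(df[df_col]):
        ws[f"{ws_col}{start_row + offset}"] = value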

Related

How do I load a dataframe into an Excel template on Amazon Web Service's S3?

Issue
I have a dataframe. The template I want to use only has column headings, and the column headings in the dataframe are identical to those in the template. How do I paste the contents of the dataframe into the template Excel sheet?
Reprex
Example dataframe
import pandas as pd
data_input = {'Area':['North', 'North', 'North', 'South', 'South', 'South', 'West', 'West', 'West', 'East', "East","East"],
"Sub-Area": ["North2", "North1", "North2", "South2", "South1", "South2","West3", "West9", "West9", "East1", "East4", "East1"],
"Workers": [1,20,30, 2,33,5, 3,6,44, 1, 11, 111],
"Job1":["T", "T", "T", "X","T", "T", "T", "X", "T", "X","T", "T"],
"Job2":["F", "X", "T", "X","T", "F", "T", "X", "F", "X","T", "T"],
"Job3":["T", "F", "T", "X","X", "F", "F", "T","X", "X","T", "T"]}
# Create DataFrame
df1 = pd.DataFrame(data_input)
Attempt #1
# Save dataframe to the template file on S3
import io
import boto3
with io.BytesIO() as output:
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        df1.to_excel(writer, sheet_name='Sheet1', startcol=0, startrow=2)
    data = output.getvalue()
s3 = boto3.resource('s3')
s3.Bucket('main_folder').put_object(Key='sub_folder/template.xlsx', Body=data)
Problem: The above solution just writes my dataset over the template file.
Attempt #2: Appending the dataframe via mode = "a"
# Save dataframe to the template file on S3
with io.BytesIO() as output:
    # here I add mode="a"
    with pd.ExcelWriter(output, engine='openpyxl', mode="a") as writer:
        df1.to_excel(writer, sheet_name='Sheet1', startcol=0, startrow=2)
    data = output.getvalue()
s3 = boto3.resource('s3')
s3.Bucket('main_folder').put_object(Key='sub_folder/template.xlsx', Body=data)
Problem: Error Message
BadZipFile: File is not a zip file
Attempt 3
In response to a comment from jsn, I tried to first append the df to the template and then load that to S3, but it overwrote all the formatting of the template again.
# downloading template
template = pd.read_excel('s3://main_folder/sub_folder/template.xlsx', sheet_name="Sheet1")
# appending the dataframe
template = template.append(df1)
# now loading to S3
with io.BytesIO() as output:
    with pd.ExcelWriter(output, engine='openpyxl') as writer:
        template.to_excel(writer, sheet_name='Sheet1')
    data = output.getvalue()
s3 = boto3.resource('s3')
s3.Bucket('main_folder').put_object(Key='sub_folder/template.xlsx', Body=data)
Any help would be appreciated
The pandas library may not be suited to preserving xlsx formatting state: ExcelWriter writes the sheet from scratch, and the BadZipFile error in Attempt #2 comes from mode="a" trying to open the empty BytesIO buffer as if it were an existing workbook.
The alternative here could be the openpyxl library, which lets you load a workbook directly and integrates with pandas so you can append your data.
You could attempt to do something like this:
from io import BytesIO
from tempfile import NamedTemporaryFile
import boto3
import pandas as pd
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
# Load Template from S3
bucket_name = "main_folder"
object_key = "sub_folder/template.xlsx"
bucket_object = boto3.resource('s3').Bucket(bucket_name).Object(object_key)
content = bucket_object.get()['Body'].read()
# Input Data
data_input = {
'Area': ['North', 'North', 'North', 'South', 'South', 'South', 'West', 'West', 'West', 'East', "East", "East"],
"Sub-Area": ["North2", "North1", "North2", "South2", "South1", "South2", "West3", "West9", "West9", "East1",
"East4", "East1"],
"Workers": [1, 20, 30, 2, 33, 5, 3, 6, 44, 1, 11, 111],
"Job1": ["T", "T", "T", "X", "T", "T", "T", "X", "T", "X", "T", "T"],
"Job2": ["F", "X", "T", "X", "T", "F", "T", "X", "F", "X", "T", "T"],
"Job3": ["T", "F", "T", "X", "X", "F", "F", "T", "X", "X", "T", "T"]}
# Create DataFrame
df = pd.DataFrame(data_input)
# Load Workbook
wb = load_workbook(filename=(BytesIO(content)))
ws = wb['Sheet1']
# Append contents of Input Data to Workbook
for r in dataframe_to_rows(df, index=False, header=False):
    ws.append(r)
# Save Workbook back to S3
s3_resource = boto3.resource('s3')
with NamedTemporaryFile(suffix=".xlsx") as tmp:
    wb.save(tmp.name)
    s3_resource.Bucket(bucket_name).upload_file(Filename=tmp.name, Key=object_key)
Reference material as follows:
https://openpyxl.readthedocs.io/en/stable/pandas.html
https://danh-was-here.netlify.app/save-excel-workbook-to-aws-s3-with-python/
Note:
This was tested locally (i.e. without AWS) and the output suggested that formatting applied to the heading columns in the template file remained even after the new data was added.
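If you would rather skip the temporary file entirely, openpyxl can also save into an in-memory buffer that is then uploaded with put_object; a minimal sketch, assuming bucket_name and object_key are defined as above:
from io import BytesIO
import boto3

buffer = BytesIO()
wb.save(buffer)  # openpyxl can write to a file-like object
boto3.resource('s3').Bucket(bucket_name).put_object(Key=object_key, Body=buffer.getvalue())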

Combine Bar and line plot in plotly

I have this kind of data:
import numpy as np
import pandas as pd

qq = pd.DataFrame(
    {
        "Predicted": np.sort(np.random.uniform(3, 15, 4)),
        "real": np.sort(np.random.uniform(3, 15, 4)),
        "Category": ['A', 'B', 'C', 'D'],
        "new_val": np.random.uniform(3, 15, 4),
    }
)
I am plotting a bar plot, and I want to add a line plot of the 'real' variable on top of it.
I am using the following command:
px.bar(qq, x=qq['Category'], y=['Predicted', 'real', 'new_val'], title="Long-Form Input").add_trace(px.line(x=qq['Category'], y=qq['real']))
But this gives me an error.
Where am I wrong?
You want to add the traces from px.line(), not the figure itself, hence the .data.
I have also updated the traces from px.line() so the line shows up in the legend.
import numpy as np
import pandas as pd
import plotly.express as px

qq = pd.DataFrame(
    {
        "Predicted": np.sort(np.random.uniform(3, 15, 4)),
        "real": np.sort(np.random.uniform(3, 15, 4)),
        "Category": ["A", "B", "C", "D"],
        "new_val": np.random.uniform(3, 15, 4),
    }
)

px.bar(
    qq, x="Category", y=["Predicted", "real", "new_val"], title="Long-Form Input"
).add_traces(
    px.line(qq, x="Category", y="real").update_traces(showlegend=True, name="real").data
)
Second y-axis (update per comments)
import numpy as np
import pandas as pd
import plotly.express as px

qq = pd.DataFrame(
    {
        "Predicted": np.sort(np.random.uniform(3, 15, 4)),
        "real": np.sort(np.random.uniform(3, 15, 4)),
        "Category": ["A", "B", "C", "D"],
        "new_val": np.random.uniform(3, 15, 4),
    }
)

px.bar(
    qq, x="Category", y=["Predicted", "real", "new_val"], title="Long-Form Input"
).add_traces(
    px.line(qq, x="Category", y="real").update_traces(showlegend=True, name="real", yaxis="y2").data
).update_layout(yaxis2={"side": "right", "overlaying": "y"})
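Setting yaxis="y2" on the line trace binds it to the secondary axis, while overlaying="y" in update_layout draws that axis on top of the primary one instead of in a separate subplot, and side="right" places its ticks on the right-hand edge.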

How to create MongoDB Time Series Collection using pymongo

The documentation shows how to do it with mongosh, but how do you create Time Series Collection using pymongo from within a python script?
import pymongo
import time
from datetime import datetime
client = pymongo.MongoClient()
db = client['time-series-db']
col = db['time-series-col']
# ... do something here to make it 'time-series collection' ...
js = {
"1": "A",
"2": "B",
"3": "C",
"4": "D",
"5": "E",
}
# create BSON type datetime object needed for 'time-series collection'
ts = time.time()
js['timestamp'] = datetime.utcfromtimestamp(ts)
col.insert_one(js)
You can try this:
conn = pymongo.MongoClient('mongodb://localhost')
db = conn.testDB
db.create_collection('testColl', timeseries={ 'timeField': 'timestamp' })
# - OR -
db.command('create', 'testColl', timeseries={ 'timeField': 'timestamp', 'metaField': 'data', 'granularity': 'hours' })
General Reference: Time Series Collections
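Putting this together with the question's script, a minimal end-to-end sketch might look like the following; it assumes a MongoDB 5.0+ server (time series collections are not available earlier), and the database and collection names are illustrative:
import pymongo
from datetime import datetime, timezone

client = pymongo.MongoClient('mongodb://localhost')
db = client['time-series-db']

# Create the time series collection once, if it does not exist yet
if 'time-series-col' not in db.list_collection_names():
    db.create_collection(
        'time-series-col',
        timeseries={'timeField': 'timestamp', 'metaField': 'data', 'granularity': 'hours'},
    )

col = db['time-series-col']
col.insert_one({'data': {'1': 'A', '2': 'B'}, 'timestamp': datetime.now(timezone.utc)})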

Python DataReader - Update with new information

import pandas as pd
from pandas_datareader import data as wb
tickers = ["MMM", "ABT", "ABBV", "ABMD", "ACN", "ATVI", "ADBE", "AMD", "AAP", "AES", "AFL", "A", "APD", "AKAM", "ALK", "ALB", "ARE", "ALXN", "ALGN", "ALLE", "LNT", "ALL", "GOOGL", "GOOG", "MO", "AMZN", "AMCR", "AEE", "AAL", "AEP", "AXP", "AIG", "AMT", "AWK", "AMP", "ABC", "AME", "AMGN", "APH", "ADI", "ANSS", "ANTM", "AON", "AOS", "APA", "AAPL", "AMAT", "APTV", "ADM", "ANET", "AJG", "AIZ", "T", "ATO", "ADSK", "ADP", "AZO", "AVB", "AVY", "BKR", "BLL", "BAC", "BK", "BAX", "BDX", "BBY", "BIO", "BIIB", "BLK", "BA", "BKNG", "BWA", "BXP", "BSX", "BMY", "AVGO", "BR", "CHRW", "COG", "CDNS", "CZR", "CPB", "COF", "CAH", "KMX", "CCL", "CARR", "CTLT", "CAT", "CBOE", "CBRE", "CDW", "CE", "CNC", "CNP", "CERN", "CF", "SCHW", "CHTR", "CVX", "CMG", "CB", "CHD", "CI", "CINF", "CTAS", "CSCO", "C", "CFG", "CTXS", "CLX", "CME", "CMS", "KO", "CTSH", "CL", "CMCSA", "CMA", "CAG", "COP", "ED", "STZ", "COO", "CPRT", "GLW", "CTVA", "COST", "CCI", "CSX", "CMI", "CVS", "DHI", "DHR", "DRI", "DVA", "DE", "DAL", "XRAY", "DVN", "DXCM", "FANG", "DLR", "DFS", "DISCA", "DISCK", "DISH", "DG", "DLTR", "D", "DPZ", "DOV", "DOW", "DTE", "DUK", "DRE", "DD", "DXC", "EMN", "ETN", "EBAY", "ECL", "EIX", "EW", "EA", "EMR", "ENPH", "ETR", "EOG", "EFX", "EQIX", "EQR", "ESS", "EL", "ETSY", "EVRG", "ES", "RE", "EXC", "EXPE", "EXPD", "EXR", "XOM", "FFIV", "FB", "FAST", "FRT", "FDX", "FIS", "FITB", "FE", "FRC", "FISV", "FLT", "FLIR", "FMC", "F", "FTNT", "FTV", "FBHS", "FOXA", "FOX", "BEN", "FCX", "GPS", "GRMN", "IT", "GNRC", "GD", "GE", "GIS", "GM", "GPC", "GILD", "GL", "GPN", "GS", "GWW", "HAL", "HBI", "HIG", "HAS", "HCA", "PEAK", "HSIC", "HSY", "HES", "HPE", "HLT", "HFC", "HOLX", "HD", "HON", "HRL", "HST", "HWM", "HPQ", "HUM", "HBAN", "HII", "IEX", "IDXX", "INFO", "ITW", "ILMN", "INCY", "IR", "INTC", "ICE", "IBM", "IP", "IPG", "IFF", "INTU", "ISRG", "IVZ", "IPGP", "IQV", "IRM", "JKHY", "J", "JBHT", "SJM", "JNJ", "JCI", "JPM", "JNPR", "KSU", "K", "KEY", "KEYS", "KMB", "KIM", "KMI", "KLAC", "KHC", "KR", "LB", "LHX", "LH", "LRCX", "LW", "LVS", "LEG", "LDOS", "LEN", "LLY", "LNC", "LIN", "LYV", "LKQ", "LMT", "L", "LOW", "LUMN", "LYB", "MTB", "MRO", "MPC", "MKTX", "MAR", "MMC", "MLM", "MAS", "MA", "MKC", "MXIM", "MCD", "MCK", "MDT", "MRK", "MET", "MTD", "MGM", "MCHP", "MU", "MSFT", "MAA", "MHK", "TAP", "MDLZ", "MPWR", "MNST", "MCO", "MS", "MOS", "MSI", "MSCI", "NDAQ", "NTAP", "NFLX", "NWL", "NEM", "NWSA", "NWS", "NEE", "NLSN", "NKE", "NI", "NSC", "NTRS", "NOC", "NLOK", "NCLH", "NOV", "NRG", "NUE", "NVDA", "NVR", "NXPI", "ORLY", "OXY", "ODFL", "OMC", "OKE", "ORCL", "OTIS", "PCAR", "PKG", "PH", "PAYX", "PAYC", "PYPL", "PENN", "PNR", "PBCT", "PEP", "PKI", "PRGO", "PFE", "PM", "PSX", "PNW", "PXD", "PNC", "POOL", "PPG", "PPL", "PFG", "PG", "PGR", "PLD", "PRU", "PEG", "PSA", "PHM", "PVH", "QRVO", "PWR", "QCOM", "DGX", "RL", "RJF", "RTX", "O", "REG", "REGN", "RF", "RSG", "RMD", "RHI", "ROK", "ROL", "ROP", "ROST", "RCL", "SPGI", "CRM", "SBAC", "SLB", "STX", "SEE", "SRE", "NOW", "SHW", "SPG", "SWKS", "SNA", "SO", "LUV", "SWK", "SBUX", "STT", "STE", "SYK", "SIVB", "SYF", "SNPS", "SYY", "TMUS", "TROW", "TTWO", "TPR", "TGT", "TEL", "TDY", "TFX", "TER", "TSLA", "TXN", "TXT", "TMO", "TJX", "TSCO", "TT", "TDG", "TRV", "TRMB", "TFC", "TWTR", "TYL", "TSN", "UDR", "ULTA", "USB", "UAA", "UA", "UNP", "UAL", "UNH", "UPS", "URI", "UHS", "UNM", "VLO", "VAR", "VTR", "VRSN", "VRSK", "VZ", "VRTX", "VFC", "VIAC", "VTRS", "V", "VNO", "VMC", "WRB", "WAB", "WMT", "WBA", "DIS", "WM", "WAT", "WEC", "WFC", "WELL", "WST", "WDC", "WU", "WRK", "WY", "WHR", "WMB", "WLTW", "WYNN", 
"XEL", "XLNX", "XYL", "YUM", "ZBRA", "ZBH", "ZION", "ZTS"]
financial_data = pd.DataFrame()
for t in tickers:
    financial_data[t] = wb.DataReader(t, data_source='yahoo', start='1995-1-1')["Adj Close"]
financial_data.to_excel("Financial Data.xlsx")
I am using DataReader to gather some stock info. I am grabbing a lot of data (from 1995 to 2021) and then exporting it to Excel. I was wondering whether there is a way, say tomorrow, to speed up the update instead of rerunning the whole script from top to bottom, since my goal tomorrow would just be to add a single new row to the Excel file. If I simply rerun the script, it overwrites the Excel file and adds the new row. That seems pretty inefficient, and I was wondering if there is a way to tell the script I am only looking for tomorrow's data rather than grabbing everything again from 1995.
Thanks.
I don't know exactly how pandas-datareader works internally, but the expensive part is downloading and handling the full history rather than the local work. Since the data is ordered by date in increasing order, it would be enough to have a variable called timestamp_toStart, initialised the first time to '1995-1-1' and, after each execution, updated to the last date read. You could save this value to a file and reload it every time you rerun the script.
financial_data = pd.DataFrame()
# load timestamp_toStart from the file here (use '1995-1-1' on the very first run)
for t in tickers:
    financial_data[t] = wb.DataReader(t, data_source='yahoo', start=timestamp_toStart)["Adj Close"]
# the last index entry is the most recent date downloaded (not sure about the exact syntax)
timestamp_toStart = financial_data.index[-1]
# save timestamp_toStart to the file here
financial_data.to_excel("Financial Data.xlsx")
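A minimal sketch of that bookkeeping, using a hypothetical last_date.txt state file and a shortened ticker list; it also merges the freshly downloaded rows into the previously saved spreadsheet instead of rebuilding everything from 1995:
import os
import pandas as pd
from pandas_datareader import data as wb

STATE_FILE = "last_date.txt"  # hypothetical file used to remember where the last run stopped

# Figure out where to start this run
if os.path.exists(STATE_FILE):
    with open(STATE_FILE) as f:
        start_date = f.read().strip()
else:
    start_date = "1995-1-1"

new_data = pd.DataFrame()
for t in ["MMM", "ABT"]:  # shortened ticker list for the example
    new_data[t] = wb.DataReader(t, data_source='yahoo', start=start_date)["Adj Close"]

# Remember the most recent downloaded date for the next run
with open(STATE_FILE, "w") as f:
    f.write(str(new_data.index[-1].date()))

# Merge with what was saved previously instead of re-downloading everything
if os.path.exists("Financial Data.xlsx"):
    old_data = pd.read_excel("Financial Data.xlsx", index_col=0)
    new_data = pd.concat([old_data, new_data])
    new_data = new_data[~new_data.index.duplicated(keep="last")]
new_data.to_excel("Financial Data.xlsx")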

Reading each column of CSV File as a key of dictionary

I am working with the googlemaps API in Python and would like to read a CSV file containing LOCATIONID, LOCATION, X, Y, LAT, LNG, REF, ID, with each of those columns as a dictionary key.
Excerpt from Data Set
ChIJWzKeTARawokRjoqe_C9poOA,THEGIRLSOUTFITS,0,17,40.71380629999999,-74.0222972,dc1885d8c12ac669e9de3a73fedfb08c40deefd3,CmRSAAAA4Mvy3PVnmnYQKIZg-VX9-UrnrhlwOs7YFeY40gdXw1JfuUeEZndlhWxoIdI2K0nt1voCQDg2mqUVKV6EBgOKMTrHjDsMHOy7MJYBCUWguzLroifP1UMWTYSfUJD6E6sMEhB3psbJCGcDh5iS1PHyX4k5GhRBXigoOyADpURVG3NG2bIqXJ2i_g
ChIJDaBbDPlZwokRuHXkJGK9ER0,NYNJ State Line,0,29,40.7245721,-74.02168689999999,3d2cfc0a35b583760187c2369bb957b5a8cd9755,CmRRAAAAN25LD8IWAiu8CAIFM2eMS7eJVReV_UHzmR6sbaRkhDlJgiX-wK0Xlb85hC1qmDkarwQrGoM2u60zfSMr1MxcyzMAg3UDgL6nYwwu3WgG1aNlbpiqR2HlLFlK5XhimqI4EhDrfqXHOxKeztoT01Aicp3hGhSETo6Pj7gTk-dajA0RWco1djlrVA
ChIJQX6hfdlZwokRoCdPrD-8wSY,Shipyard Marina,0,59,40.751581,-74.021287,5f5e1aa75a39ce1880085d5094694a9ceda1ff43,CmRRAAAAp_VV6L2SQgnjC_TeMYRe_m2EYyjo_IdAOUqSeuRM9kbEMHSyguWco9jDV6rdYxk5s1E5s21BX0P7flWfK2LCegcKi1m_AOp2LuYtuzv0ql4-Jeq6-yx50Z180J9U4nrHEhAihnIm2i8MWwk7ATToiLBgGhR-w02BFnO9EGniCms-XWq00VrGOg
ChIJi3MPPNNZwokRDu8nE0ldrwk,Lincoln Harbor,0,68,40.759708,-74.02225,a3192a26d0a5a779da702a7f871612992e26f606,CmRRAAAAGcYQNoc-WgRCwh3rlBzeEmlG_XQgk_coHa4qUpzWU_9DiDWnU8eLjhInrZqpR7nKibXc0dcZIYiZEb-ehKvIrF6OfbLGrpmbf47YNaofAebceKirPZ_g6jMsc-_dcITaEhAKqkWagpXbgd3CcQf8Rtz-GhQtzlAGJlP7gOzV2KfXRGFIF2ZbAA
ChIJq35iOdNZwokRKvT9_KFP8jg,Lincoln Harbor,0,68,40.75984079999999,-74.0223211,38ea8de2bbf6d8a710bec3104d180d71b3d00735,CmRRAAAASNnGG2MHHIDGcXfgH-8iWUQOEahJdMZz6dy24azZrqyXqEtMb_yP1MJRGMV6Cp6lX05MgU2vNIfHcOGBfXiG0yWU7Qe0-t9_Uhm7JxNuAOEge9ZaCwNlrtxuxr8xSYsJEhAkL9vA29DV4nReD4e85D1TGhTbZYRfvNNRZi5a3NMUA0gyMYoR_A
ChIJ07hkOtNZwokRfY7FzKrxTkA,852858 Harbor Blvd,0,68,40.7599265,-74.02213569999999,29cfd2338edc10f94f5464b2fe152ab7bab3fca6,CmRbAAAAdlTDQx7qWhNnvgDQ_1afttKRpJ_MzBULrJLNX0TFEwpvKTjCvCZoXfV1F0A_diKH5D8qDjjucrjUC3gV8iVxqOYoHRGYwifE2FSiVycUrjunQxiDYAd-C6Q7QOqswAomEhAewuH-Q09vSAW_hNbQ0AUOGhQqRXCRrgxc6EAl2xRv1Z73M4MMug
ChIJnYuB1ixYwokR3w26K1mr3JQ,1500 Harbor Blvd,0,69,40.7609258,-74.02135559999999,ed57711af76a2534406a70f21d8a7119170b0f72,CmRbAAAAbPsHkeqeX3cx5MdED0Ao6ic0PE56xhGMdda0zXolIU7KsPmgPbT1CjXfyZ0p4ws_GYgdORZB3qP0idFvWBtMpdYpXCq0VknITsvfAoNfwYXoccYGsP7QvfpBPb1oTnywEhDwyrKX9JiQkYx9YaIXhkOwGhRCtFkL2futi7BkUxo_dMYEtsWVWA
ChIJk7Y29CxYwokRLcD4dsTPCCM,800816 Waterfront Terrace,0,71,40.7629756,-74.0223474,727cf2af87d28ec3a45f11e76e0746acfbec03b0,CmRbAAAAzgEUAIfexip_ePX2VQ29-iZMDPe-5RA4aGhFRoDkEGHrTaUON7ZGyn1r16erRSWIAhESFzycoG4Rwuw4OijSlLjKfK3_HCvC4fkrX-d6I7g7ffkIotwc1KK77UkHPMz-EhBoESHGx4Ke3H9y6XfDt9iaGhSGqEI5BmtpMu4URYwFmrT-2npHWQ
ChIJsZ2B5CxYwokRL8NrRMKxD9Q,Estuary Marketplace,0,71,40.7622809,-74.021577,1b6c2bd55b1bbabe3aac0d1eb2153e812c1026fd,CmRSAAAAO_HAcE9pNnA6r0FDp1eVsfS6jVn31DxU_JhcZNACGz9kfI5xOPjTbgM77JhJLxZPJqLgO_AnNXIIyEQV_wqO5Pr-YWJaIyhCEYA8Ene36VXgQ_90NMQwa_HNJJnWywkpEhBYQL6JbpsO0FJcOawnw5-2GhRE_QU6uh-cP-RKArGSyfEE2cxL3A
ChIJ64dzUNRZwokRIHBPnvi-eJs,Estuary Apartments,0,71,40.76240009999999,-74.0216088,8219ad9d883d8891a651bc35ff7bf7abe7c7b72a,CmRSAAAA7t8RT2MVUpaxNWBPItFU3pTQwNqXDuRY28GxZ_cywaLqOFIJ7taeXDHvGg19Y0MoIerm94HrD0iZQ_iIyoUrCKueeUETgHUGf8LQJMl9mqu784B5iIIXdf9-YrylAFJ9EhCvaowPZkGZvaEVLENy8fywGhRSEtZOm86qEyOCksoFWNJp9L_daw
ChIJWexh_9hZwokRgU6sf8pvbUg,Hoboken Weehawken Notary Public Dorian Cattani,0,71,40.762501,-74.021528,11be9c0505e8db6a7fcaea5fd4003d336b8667f2,CmRRAAAA9hteDDXN0TehPTxfnx6LxRjZTVHZOyOgbE1GJ42XOzB2v6htFgfliFz39e6llMHSlqvFS3uBiIKuH9bNu9qIfeSD8LlrSW5UXKP4U7sN-Zi1-IWH5QJw5S2hQCZHaNfeEhAT4Y9Q2hC0uG0vVi7CJPuVGhQPb5J4VpGisRfvRfiqEyPgTR8F5w
ChIJRaQT-CxYwokRlEhCGnjf6fs,EcoPure Home Cleaning Service Weehawken,0,72,40.76302460000001,-74.0218549,33800368abbf808a39da3e805c685b245e5baac4,CmRSAAAAyod24y6jY17fcd2b2mk6qgIoN_KWOCNxEN1zDniW9n7RHoWTm-MPXFN6N77XwYzORC-WarFmyP9jULhochuKcXcYP2y2ni7SWFviXVXOBtxvYVlHmfsyHctEvBy_GxKbEhB5ihU8my5kq4lQpQrwGIrQGhT-0cYHqRnMImt55xhU7CBDPPlA1w
ChIJFU-XRyxYwokRo5Do2LLO_mE,New Jersey 495,0,73,40.76429109999999,-74.0220365,23e5baf02ae51b7aeaf94efa56624c064bdbb456,CmRbAAAA_30BmNHIeCFJ7SVhLHWIbSa8llsfKQfYpVRC8X0feMrRlQ9ih9_vKtBfq-KRC6lcagEYQCBRdCfDob2divgzQEbrVkc9dR4v3oIfdyc5l9mlRyTnl1fOBxSeR8xMz78sEhD8FAIlIOza3aIMvqsdfjzrGhTj2R00Cv-lzmKIFEwLumKKd8g1cQ
The current code I'm trying:
import pandas as pd
import numpy as np
import numpy.random as rdm
import matplotlib.pyplot as plt
import math as m
data = np.genfromtxt('PlacesFinalNew.csv', delimiter = ',')
size = int(data.size/8)
dict = {'LOCATIONID':[], 'LOCATION':[], 'X':[],'Y':[],'LAT':[],'LNG':[],'ID':[],'REF':[]}
for i in range(size):
    dict['LOCATIONID'].append(data[i][0])
    dict['LOCATION'].append(data[i][1])
    dict['X'].append(data[i][2])
    dict['Y'].append(data[i][3])
    dict['LAT'].append(data[i][4])
    dict['LNG'].append(data[i][5])
    dict['ID'].append(data[i][6])
    dict['REF'].append(data[i][7])
This works for every key except for Location. When I print out dict['LOCATION'] I get a list of nan. Could someone please point out the problem to me?
np.genfromtxt defaults to a float dtype, so non-numeric fields such as LOCATION come back as nan; reading the file with the csv module keeps every field as a string. You can use zip:
import csv
headers = ['LOCATIONID', 'LOCATION', 'X', 'Y', 'LAT', 'LNG', 'REF', 'ID']
with open('filename.csv') as f:
    data = [dict(zip(headers, i)) for i in csv.reader(f)]
final_results = {i: [c[i] for c in data] for i in headers}

import json
print(json.dumps(final_results, indent=4))
Output:
{
"LOCATIONID": [
"ChIJWzKeTARawokRjoqe_C9poOA",
"ChIJDaBbDPlZwokRuHXkJGK9ER0",
"ChIJQX6hfdlZwokRoCdPrD-8wSY",
"ChIJi3MPPNNZwokRDu8nE0ldrwk",
"ChIJq35iOdNZwokRKvT9_KFP8jg",
"ChIJ07hkOtNZwokRfY7FzKrxTkA",
"ChIJnYuB1ixYwokR3w26K1mr3JQ",
"ChIJk7Y29CxYwokRLcD4dsTPCCM",
"ChIJsZ2B5CxYwokRL8NrRMKxD9Q",
"ChIJ64dzUNRZwokRIHBPnvi-eJs",
"ChIJWexh_9hZwokRgU6sf8pvbUg",
"ChIJRaQT-CxYwokRlEhCGnjf6fs",
"ChIJFU-XRyxYwokRo5Do2LLO_mE"
],
"LOCATION": [
"THEGIRLSOUTFITS",
"NYNJ State Line",
"Shipyard Marina",
"Lincoln Harbor",
"Lincoln Harbor",
"852858 Harbor Blvd",
"1500 Harbor Blvd",
"800816 Waterfront Terrace",
"Estuary Marketplace",
"Estuary Apartments",
"Hoboken Weehawken Notary Public Dorian Cattani",
"EcoPure Home Cleaning Service Weehawken",
"New Jersey 495"
],
"X": [
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0",
"0"
],
"Y": [
"17",
"29",
"59",
"68",
"68",
"68",
"69",
"71",
"71",
"71",
"71",
"72",
"73"
],
"LAT": [
"40.71380629999999",
"40.7245721",
"40.751581",
"40.759708",
"40.75984079999999",
"40.7599265",
"40.7609258",
"40.7629756",
"40.7622809",
"40.76240009999999",
"40.762501",
"40.76302460000001",
"40.76429109999999"
],
"LNG": [
"-74.0222972",
"-74.02168689999999",
"-74.021287",
"-74.02225",
"-74.0223211",
"-74.02213569999999",
"-74.02135559999999",
"-74.0223474",
"-74.021577",
"-74.0216088",
"-74.021528",
"-74.0218549",
"-74.0220365"
],
"REF": [
"dc1885d8c12ac669e9de3a73fedfb08c40deefd3",
"3d2cfc0a35b583760187c2369bb957b5a8cd9755",
"5f5e1aa75a39ce1880085d5094694a9ceda1ff43",
"a3192a26d0a5a779da702a7f871612992e26f606",
"38ea8de2bbf6d8a710bec3104d180d71b3d00735",
"29cfd2338edc10f94f5464b2fe152ab7bab3fca6",
"ed57711af76a2534406a70f21d8a7119170b0f72",
"727cf2af87d28ec3a45f11e76e0746acfbec03b0",
"1b6c2bd55b1bbabe3aac0d1eb2153e812c1026fd",
"8219ad9d883d8891a651bc35ff7bf7abe7c7b72a",
"11be9c0505e8db6a7fcaea5fd4003d336b8667f2",
"33800368abbf808a39da3e805c685b245e5baac4",
"23e5baf02ae51b7aeaf94efa56624c064bdbb456"
],
"ID": [
"CmRSAAAA4Mvy3PVnmnYQKIZg-VX9-UrnrhlwOs7YFeY40gdXw1JfuUeEZndlhWxoIdI2K0nt1voCQDg2mqUVKV6EBgOKMTrHjDsMHOy7MJYBCUWguzLroifP1UMWTYSfUJD6E6sMEhB3psbJCGcDh5iS1PHyX4k5GhRBXigoOyADpURVG3NG2bIqXJ2i_g",
"CmRRAAAAN25LD8IWAiu8CAIFM2eMS7eJVReV_UHzmR6sbaRkhDlJgiX-wK0Xlb85hC1qmDkarwQrGoM2u60zfSMr1MxcyzMAg3UDgL6nYwwu3WgG1aNlbpiqR2HlLFlK5XhimqI4EhDrfqXHOxKeztoT01Aicp3hGhSETo6Pj7gTk-dajA0RWco1djlrVA",
"CmRRAAAAp_VV6L2SQgnjC_TeMYRe_m2EYyjo_IdAOUqSeuRM9kbEMHSyguWco9jDV6rdYxk5s1E5s21BX0P7flWfK2LCegcKi1m_AOp2LuYtuzv0ql4-Jeq6-yx50Z180J9U4nrHEhAihnIm2i8MWwk7ATToiLBgGhR-w02BFnO9EGniCms-XWq00VrGOg",
"CmRRAAAAGcYQNoc-WgRCwh3rlBzeEmlG_XQgk_coHa4qUpzWU_9DiDWnU8eLjhInrZqpR7nKibXc0dcZIYiZEb-ehKvIrF6OfbLGrpmbf47YNaofAebceKirPZ_g6jMsc-_dcITaEhAKqkWagpXbgd3CcQf8Rtz-GhQtzlAGJlP7gOzV2KfXRGFIF2ZbAA",
"CmRRAAAASNnGG2MHHIDGcXfgH-8iWUQOEahJdMZz6dy24azZrqyXqEtMb_yP1MJRGMV6Cp6lX05MgU2vNIfHcOGBfXiG0yWU7Qe0-t9_Uhm7JxNuAOEge9ZaCwNlrtxuxr8xSYsJEhAkL9vA29DV4nReD4e85D1TGhTbZYRfvNNRZi5a3NMUA0gyMYoR_A",
"CmRbAAAAdlTDQx7qWhNnvgDQ_1afttKRpJ_MzBULrJLNX0TFEwpvKTjCvCZoXfV1F0A_diKH5D8qDjjucrjUC3gV8iVxqOYoHRGYwifE2FSiVycUrjunQxiDYAd-C6Q7QOqswAomEhAewuH-Q09vSAW_hNbQ0AUOGhQqRXCRrgxc6EAl2xRv1Z73M4MMug",
"CmRbAAAAbPsHkeqeX3cx5MdED0Ao6ic0PE56xhGMdda0zXolIU7KsPmgPbT1CjXfyZ0p4ws_GYgdORZB3qP0idFvWBtMpdYpXCq0VknITsvfAoNfwYXoccYGsP7QvfpBPb1oTnywEhDwyrKX9JiQkYx9YaIXhkOwGhRCtFkL2futi7BkUxo_dMYEtsWVWA",
"CmRbAAAAzgEUAIfexip_ePX2VQ29-iZMDPe-5RA4aGhFRoDkEGHrTaUON7ZGyn1r16erRSWIAhESFzycoG4Rwuw4OijSlLjKfK3_HCvC4fkrX-d6I7g7ffkIotwc1KK77UkHPMz-EhBoESHGx4Ke3H9y6XfDt9iaGhSGqEI5BmtpMu4URYwFmrT-2npHWQ",
"CmRSAAAAO_HAcE9pNnA6r0FDp1eVsfS6jVn31DxU_JhcZNACGz9kfI5xOPjTbgM77JhJLxZPJqLgO_AnNXIIyEQV_wqO5Pr-YWJaIyhCEYA8Ene36VXgQ_90NMQwa_HNJJnWywkpEhBYQL6JbpsO0FJcOawnw5-2GhRE_QU6uh-cP-RKArGSyfEE2cxL3A",
"CmRSAAAA7t8RT2MVUpaxNWBPItFU3pTQwNqXDuRY28GxZ_cywaLqOFIJ7taeXDHvGg19Y0MoIerm94HrD0iZQ_iIyoUrCKueeUETgHUGf8LQJMl9mqu784B5iIIXdf9-YrylAFJ9EhCvaowPZkGZvaEVLENy8fywGhRSEtZOm86qEyOCksoFWNJp9L_daw",
"CmRRAAAA9hteDDXN0TehPTxfnx6LxRjZTVHZOyOgbE1GJ42XOzB2v6htFgfliFz39e6llMHSlqvFS3uBiIKuH9bNu9qIfeSD8LlrSW5UXKP4U7sN-Zi1-IWH5QJw5S2hQCZHaNfeEhAT4Y9Q2hC0uG0vVi7CJPuVGhQPb5J4VpGisRfvRfiqEyPgTR8F5w",
"CmRSAAAAyod24y6jY17fcd2b2mk6qgIoN_KWOCNxEN1zDniW9n7RHoWTm-MPXFN6N77XwYzORC-WarFmyP9jULhochuKcXcYP2y2ni7SWFviXVXOBtxvYVlHmfsyHctEvBy_GxKbEhB5ihU8my5kq4lQpQrwGIrQGhT-0cYHqRnMImt55xhU7CBDPPlA1w",
"CmRbAAAA_30BmNHIeCFJ7SVhLHWIbSa8llsfKQfYpVRC8X0feMrRlQ9ih9_vKtBfq-KRC6lcagEYQCBRdCfDob2divgzQEbrVkc9dR4v3oIfdyc5l9mlRyTnl1fOBxSeR8xMz78sEhD8FAIlIOza3aIMvqsdfjzrGhTj2R00Cv-lzmKIFEwLumKKd8g1cQ"
]
}
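Note that csv.reader returns every field as a string, which is why the numeric-looking X, Y, LAT and LNG values are quoted in the output above; convert them with int()/float() if you need numbers.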
If your CSV file contains header names, then simply reading it with pandas creates a DataFrame whose column headers become dictionary-style keys. For example:
df = pd.read_csv(fname)
df['ColumnName']
import pandas as pd

df = pd.read_csv('test.csv', delimiter=',', na_values="nan")
print(df['LOCATIONID'])  # gives you the column data for 'LOCATIONID'
print(df['LOCATION'])
You can read each row into a dictionary using csv.DictReader():
import csv

fieldnames = ['LOCATIONID', 'LOCATION', 'X', 'Y', 'LAT', 'LNG', 'REF', 'ID']
with open('data.csv') as in_file:
    csv_reader = csv.DictReader(in_file, fieldnames=fieldnames)
    for row in csv_reader:
        # print out row info, e.g.:
        print(row['LOCATION'])
Which basically maps each row to a collections.OrderedDict(), which is just an ordered dictionary.
If you want to map the row information to a final dictionary, you can use a collections.defaultdict():
import csv
from collections import defaultdict

fieldnames = ['LOCATIONID', 'LOCATION', 'X', 'Y', 'LAT', 'LNG', 'REF', 'ID']
row_map = defaultdict(list)
with open('data.csv') as in_file:
    csv_reader = csv.DictReader(in_file, fieldnames=fieldnames)
    for row in csv_reader:
        for field in row:
            row_map[field].append(row[field])
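If pandas is acceptable as a dependency, roughly the same column-keyed dictionary can be built in one call; a small sketch, assuming the file has no header row so the field names are passed explicitly:
import pandas as pd

fieldnames = ['LOCATIONID', 'LOCATION', 'X', 'Y', 'LAT', 'LNG', 'REF', 'ID']
# header=None tells pandas the first line is data, not column names
column_map = pd.read_csv('data.csv', header=None, names=fieldnames).to_dict(orient='list')
print(column_map['LOCATION'])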
