Is there an algorithm to detect the data type of each column of a file or dataframe? The challenge is to suggest the data type when the data is wrong, missing, or unnormalized. I want to detect the data types named in the keys. My first try was to use messytables, but the result is really bad unless the data is normalized beforehand. So maybe there is an algorithm that gives better results for a type suggestion, or a way to normalize the data without knowing the data. The result should match the keys of the dataframe.
import io

import pandas as pd
from messytables import CSVTableSet, type_guess

# Sample columns; each key names the data type its column should be detected as.
# Every column starts with a placeholder ("N.A"/"N.A.") and an empty string to
# simulate wrong and missing values.
data = {
    "decimals": ["N.A", "", "111", "111.00", "111,12", "11,111.34"],
    "dates1": ["N.A.", "", "02/17/2009", "2009/02/17", "February 17, 2009", "2014, Feb 17"],
    "dates2": ["N.A.", "", "02/17/2009", "2009/02/17", "02/17/2009", "02/17/2009"],
    "dates3": ["N.A.", "", "2009/02/17", "2009/02/17", "2009/02/17", "2009/02/17"],
    "strings": ["N.A.", "", "N.A.", "N.A.", "test", "abc"],
    "integers": ["N.A.", "", "1234", "123123", "2222", "0"],
    "time": ["N.A.", "", "05:41:12", "05:40:12", "05:41:30", "06:41:12"],
    "datetime": ["N.A.", "", "10/02/2021 10:39:24", "10/02/2021 10:39:24", "10/02/2021 10:39:24", "10/02/2021 10:39:24"],
    "boolean": ["N.A.", "", "True", "False", "False", "False"]
}
df = pd.DataFrame(data)

# Serialize the frame to an in-memory CSV and hand that buffer to CSVTableSet.
towrite = io.BytesIO()
df.to_csv(towrite)  # write to BytesIO buffer
towrite.seek(0)  # rewind so the reader starts at the first byte
rows = CSVTableSet(towrite).tables[0]
types = type_guess(rows.sample)
print(types)  # [Integer, Integer, String, String, Date(%Y/%m/%d), String, Integer, String, Date(%d/%m/%Y %H:%M:%S), Bool]
Here is my take on your interesting question.
With the dataframe you provided, here is one way to do it:
# For each main type, define a lambda helper function which returns the number
# of values in the given column of said type.
helpers = {
    # Digits with at least one "." (the dots are stripped before the digit test).
    "float": lambda df, col: df[col]
    .apply(lambda x: x.replace(".", "").isdigit() and "." in x)
    .sum(),
    "integer": lambda df, col: df[col].apply(lambda x: x.isdigit()).sum(),
    # Values that cannot be parsed become NaT (errors="coerce") and are not counted.
    # NOTE: infer_datetime_format is deprecated since pandas 2.0, where strict
    # format inference is the default.
    "datetime": lambda df, col: pd.to_datetime(
        df[col], errors="coerce", infer_datetime_format=True
    )
    .notna()
    .sum(),
    "bool": lambda df, col: df[col].apply(lambda x: x == "True" or x == "False").sum(),
}

# Iterate on each column of the dataframe and get the type with maximum number of values
df_dtypes = {}
for col in df.columns:
    results = {key: helper(df, col) for key, helper in helpers.items()}
    # On a tie, max() keeps the first key in `helpers` insertion order.
    best_result = max(results, key=results.get)
    # Fall back to "string" when no helper matched a single value.
    df_dtypes[col] = best_result if results[best_result] else "string"
print(df_dtypes)
# Output
{
"decimals": "float",
"dates1": "datetime",
"dates2": "datetime",
"dates3": "datetime",
"strings": "string",
"integers": "integer",
"time": "datetime",
"datetime": "datetime",
"boolean": "bool",
}
Related
I read csv file into a dataframe named df
Each row contains a string like the one below.
'{"id":2140043003,"name":"Olallo Rubio",...}'
I would like to extract "name" and "id" from each row and make a new dataframe to store the str.
I use the following codes to extract but it shows an error. Please let me know if there is any suggestions on how to solve this problem. Thanks
JSONDecodeError: Expecting ',' delimiter: line 1 column 32 (char 31)
text={
"id": 2140043003,
"name": "Olallo Rubio",
"is_registered": True,
"chosen_currency": 'Null',
"avatar": {
"thumb": "https://ksr-ugc.imgix.net/assets/019/223/259/16513215a3869caaea2d35d43f3c0c5f_original.jpg?w=40&h=40&fit=crop&v=1510685152&auto=format&q=92&s=653706657ccc49f68a27445ea37ad39a",
"small": "https://ksr-ugc.imgix.net/assets/019/223/259/16513215a3869caaea2d35d43f3c0c5f_original.jpg?w=160&h=160&fit=crop&v=1510685152&auto=format&q=92&s=0bd2f3cec5f12553e679153ba2b5d7fa",
"medium": "https://ksr-ugc.imgix.net/assets/019/223/259/16513215a3869caaea2d35d43f3c0c5f_original.jpg?w=160&h=160&fit=crop&v=1510685152&auto=format&q=92&s=0bd2f3cec5f12553e679153ba2b5d7fa"
},
"urls": {
"web": {
"user": "https://www.kickstarter.com/profile/2140043003"
},
"api": {
"user": "https://api.kickstarter.com/v1/users/2140043003?signature=1531480520.09df9a36f649d71a3a81eb14684ad0d3afc83e03"
}
}
}
def extract(text, *args):
    """Return the values stored in ``text`` under each key in ``args``.

    Args:
        text: Object supporting ``text[key]`` lookups (e.g. a dict).
        *args: Keys to look up, in the order the values should be returned.

    Returns:
        A list with one value per key, in the same order as ``args``;
        an empty list when no keys are given.

    Raises:
        KeyError: If ``text`` is a dict and one of the keys is missing.
    """
    return [text[key] for key in args]
print(extract(text,'name','id'))
# ['Olallo Rubio', 2140043003]
Here's what I came up with using pandas.json_normalize():
import pandas as pd
sample = [{
"id": 2140043003,
"name":"Olallo Rubio",
"is_registered": True,
"chosen_currency": None,
"avatar":{
"thumb":"https://ksr-ugc.imgix.net/assets/019/223/259/16513215a3869caaea2d35d43f3c0c5f_original.jpg?w=40&h=40&fit=crop&v=1510685152&auto=format&q=92&s=653706657ccc49f68a27445ea37ad39a",
"small":"https://ksr-ugc.imgix.net/assets/019/223/259/16513215a3869caaea2d35d43f3c0c5f_original.jpg?w=160&h=160&fit=crop&v=1510685152&auto=format&q=92&s=0bd2f3cec5f12553e679153ba2b5d7fa",
"medium":"https://ksr-ugc.imgix.net/assets/019/223/259/16513215a3869caaea2d35d43f3c0c5f_original.jpg?w=160&h=160&fit=crop&v=1510685152&auto=format&q=92&s=0bd2f3cec5f12553e679153ba2b5d7fa"
},
"urls":{
"web":{
"user":"https://www.kickstarter.com/profile/2140043003"
},
"api":{
"user":"https://api.kickstarter.com/v1/users/2140043003?signature=1531480520.09df9a36f649d71a3a81eb14684ad0d3afc83e03"
}
}
}]
# Create the dataframe from the list of records in `sample`.
df = pd.json_normalize(sample)
# Select columns into new dataframe.
df1 = df.loc[:, ["name", "id",]]
Check df1:
Input:
print(df1)
Output:
name id
0 Olallo Rubio 2140043003
I have a list
list1= ['{"bank_name": null, "country": null, "url": null, "type": "Debit", "scheme": "Visa", "bin": "789452"}\n',
'{"prepaid": "", "bin": "123457", "scheme": "Visa", "type": "Debit", "bank_name": "Ohio", "url": "www.u.org", "country": "UKs"}\n']
I passed it into a dataframe:
df = pd.DataFrame({'bincol':list1})
print(df)
bincol
0 {"bank_name": null, "country": null, "url": nu...
1 {"prepaid": "", "bin": "123457", "scheme": "Vi...
I am trying to split bincol columns into new columns using this function
def explode_col(df, column_value):
    """Expand a column of dict-like values into one column per key.

    Args:
        df: DataFrame holding the column to expand. The caller's frame is not
            modified; the work is done on the copy returned by ``dropna``.
        column_value: Label of the column whose values are dicts, or strings
            containing Python dict literals.

    Returns:
        A new DataFrame with one (lower-cased) column per key found in the
        expanded values, joined with the remaining columns of ``df`` and
        re-indexed from 0.

    Raises:
        ValueError: If a string value is not a valid Python literal.
        IndexError: If no rows remain after dropping missing values.
    """
    df = df.dropna(subset=[column_value])
    if isinstance(df[str(column_value)].iloc[0], str):
        # NOTE: ast.literal_eval only understands Python literals. JSON-only
        # tokens such as null/true/false raise
        # "ValueError: malformed node or string".
        df[column_value] = df[str(column_value)].apply(ast.literal_eval)
    # pop() removes the source column from df, so the join below only brings
    # back the other columns.
    expanded_child_df = (
        pd.concat({i: json_normalize(x) for i, x in df.pop(str(column_value)).items()})
        .reset_index(level=1, drop=True)
        .join(df, how='right', lsuffix='_left', rsuffix='_right')
        .reset_index(drop=True)
    )
    expanded_child_df.columns = map(str.lower, expanded_child_df.columns)
    return expanded_child_df
df2 = explode_col(df,'bincol')
But i am getting this error, am i missing something here ?
raise ValueError(f'malformed node or string: {node!r}')
ValueError: malformed node or string: <_ast.Name object at 0x7fd3aa05c400>
For your sample data, what works for me is using json.loads to convert the strings to dictionaries, and then json_normalize to build the DataFrame:
import json
df = pd.json_normalize(df['bincol'].apply(json.loads))
print(df)
bank_name country url type scheme bin prepaid
0 None None None Debit Visa 789452 NaN
1 Ohio UKs www.u.org Debit Visa 123457
I have an excel sheet which is in the below format
I want to convert this excel sheet into JSON format using Python. each JSON object is a diagonal value and column headings in the below format.
{
"Records": [
{
"RecordId": "F1",
"Assets": [
{
"AssetId": "A1",
"Support": "S11"
},
{
"AssetId": "A2",
"Support": "S12"
},
{
"AssetId": "A3",
"Support": "S13"
}
]
},
{
"RecordId": "F2",
"Assets": [
{
"AssetId": "A1",
"Support": "S21"
},
{
"AssetId": "A2",
"Support": "S22"
},
{
"AssetId": "A3",
"Support": "S23"
}
]
}
]
}
I have written some code it seems not working as I expected.
import json
import pandas as pd
df = pd.read_excel (r'test.xlsx', sheet_name='Sheet2')
#initialize data
data=[0 for i in range(len(df))]
datac=[0 for c in range(len(df.columns))]
newset=dict()
for i in range(len(df)):
# data[i] = r'{"'+str(df.columns.values[0])+'": "' +str(df.loc[i][0])+'", '+str(df.columns.values[1])+'": "' +str(df.loc[i][1])+'", '+str(df.columns.values[2])+'": "' +str(df.loc[i][2])+'"}'
#data[i] = {str(df.columns.values[1]) : str(df.loc[i][0]), str(df.columns.values[1]): str(df.loc[i][1]), str(df.columns.values[2]): str(df.loc[i][2])}
for c in range(1,len(df.columns)):
#data[i] = {str('RecordId') : str(df.loc[i][0]),str('Assets'):[{"AssetId": str(df.columns.values[c]),"Support": str(df.loc[i][c])}]}
datac[c] = {"AssetId": str(df.columns.values[c]),"Support": str(df.loc[i][c])}
data[i]={str('RecordId') : str(df.loc[i][0]),str('Assets'):datac[c]}
print(data[i])
output_lines = [json.dumps(line)+",\n" for line in data]
output_lines[-1] = output_lines[-1][:-2] # remove ",\n" from last line
with open(r'Savedwork.json', 'w') as json_file:
json_file.writelines(output_lines)
What you need is the iterrows() method; it will iterate over the
dataframe's rows as (index, series) pairs. The columns attribute will give you
the column names, so you'll be able to iterate over the columns in the
series, and access them by name.
import json

import pandas as pd

df = pd.read_excel('test.xlsx')

# Build one record per spreadsheet row: the first cell is the record id and
# every remaining column contributes one {AssetId, Support} entry.
recs = []
for _, row in df.iterrows():
    rec = {
        # Use .iloc for positional access: row[0] on a label-indexed Series
        # relies on a deprecated positional fallback.
        'RecordId': row.iloc[0],
        'Assets': [{'AssetId': c, 'Support': row[c]} for c in df.columns[1:]]
    }
    recs.append(rec)
out = {'Records': recs}
(yes, it could all be done in a single list comprehension, but abusing those hinders readability)
Also, you don't need to do json.dumps on lines, and then assemble them with
newlines (don't work at the text level): build a dictionary with the entire
data, and then json.dump that:
print(json.dumps(out, indent=4))
You can create the dicts directly in pandas.
First set the first column with F1, F2 as index:
df.set_index(0, inplace = True)
df.index.name = None
Then create the dicts in pandas with dict keys as column names, export it to a dict and save it to json:
import json

# With axis=1 each `x` is one row, so `x.name` is the row label (the record id).
# The asset id is the column label, which x.items() yields next to each value.
df = (
    df.apply(
        lambda x: [{"AssetId": col, "Support": val} for col, val in x.items()],
        axis=1,
    )
    .reset_index()
    .rename(columns={'index': 'RecordId', 0: 'Assets'})
)
json_data = {"Records": df.to_dict('records')}
with open(r'Savedwork.json', 'w') as fp:
    json.dump(json_data, fp)
Another solution is to take a snapshot of the entire workbook in JSON format and reorganize it out of the box. Using the collect function of XLtoy, it is possible to do that from the command line; this approach gives you more degrees of freedom.
[I'm the main developer of XLtoy]
I'm converting several JSON files into a CSV using the following code below, it works as intended, but it converts all of the data in the JSON file. Instead, I want it to do the following:
Load JSON file [done]
Extract certain nested data in the JSON file [wip]
Convert to CSV [done]
Current Code
import json
import pandas
from flatten_json import flatten

# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file; the context manager closes the handle once
# parsing is done instead of leaving it open.
with open(file_path + '.json', 'r', encoding='utf-8', errors='ignore') as json_file:
    dic = json.load(json_file)

# Flatten and convert to a data frame
dic_flattened = (flatten(d, '.') for d in dic)
df = pandas.DataFrame(dic_flattened)

# Export to CSV in the same directory with the original file name
export_csv = df.to_csv (file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
In the example at the bottom, I only want everything under the following keys: created, emails, and identities. The rest is useless information (such as statusCode) or it's duplicated under a different key name (such as profile and userInfo).
I know it requires a for loop and if statement to specify the key names later on, but not sure the best way to implement it. This is what I have so far when I want to test it:
Attempted Code
import json, pandas
from flatten_json import flatten
# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'
# Open and load the JSON file
json_file = open(file_path + '.json', 'r', encoding='utf-8', errors='ignore')
dic = json.load(json_file)
# List keys to extract
key_list = ['created', 'emails', 'identities']
for d in dic:
#print(d['identities']) #Print all 'identities'
#if 'identities' in d: #Check if 'identities' exists
if key_list in d:
# Flatten and convert to a data frame
#dic_flattened = (flatten(d, '.') for d in dic)
#df = pandas.DataFrame(dic_flattened)
else:
# Skip
# Export to CSV in the same directory with the original file name
#export_csv = df.to_csv (file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
Is this the right logic?
file_name.json Example
[
{
"callId": "abc123",
"errorCode": 0,
"apiVersion": 2,
"statusCode": 200,
"statusReason": "OK",
"time": "2020-12-14T12:00:32.744Z",
"registeredTimestamp": 1417731582000,
"UID": "_guid_abc123==",
"created": "2014-12-04T22:19:42.894Z",
"createdTimestamp": 1417731582000,
"data": {},
"preferences": {},
"emails": {
"verified": [],
"unverified": []
},
"identities": [
{
"provider": "facebook",
"providerUID": "123",
"allowsLogin": true,
"isLoginIdentity": true,
"isExpiredSession": true,
"lastUpdated": "2014-12-04T22:26:37.002Z",
"lastUpdatedTimestamp": 1417731997002,
"oldestDataUpdated": "2014-12-04T22:26:37.002Z",
"oldestDataUpdatedTimestamp": 1417731997002,
"firstName": "John",
"lastName": "Doe",
"nickname": "John Doe",
"profileURL": "https://www.facebook.com/John.Doe",
"age": 30,
"birthDay": 31,
"birthMonth": 12,
"birthYear": 1969,
"city": "City, State",
"education": [
{
"school": "High School Name",
"schoolType": "High School",
"degree": null,
"startYear": 0,
"fieldOfStudy": null,
"endYear": 0
}
],
"educationLevel": "High School",
"followersCount": 0,
"gender": "m",
"hometown": "City, State",
"languages": "English",
"locale": "en_US",
"name": "John Doe",
"photoURL": "https://graph.facebook.com/123/picture?type=large",
"timezone": "-8",
"thumbnailURL": "https://graph.facebook.com/123/picture?type=square",
"username": "john.doe",
"verified": "true",
"work": [
{
"companyID": null,
"isCurrent": null,
"endDate": null,
"company": "Company Name",
"industry": null,
"title": "Company Title",
"companySize": null,
"startDate": "2010-12-31T00:00:00"
}
]
}
],
"isActive": true,
"isLockedOut": false,
"isRegistered": true,
"isVerified": false,
"lastLogin": "2014-12-04T22:26:33.002Z",
"lastLoginTimestamp": 1417731993000,
"lastUpdated": "2014-12-04T22:19:42.769Z",
"lastUpdatedTimestamp": 1417731582769,
"loginProvider": "facebook",
"loginIDs": {
"emails": [],
"unverifiedEmails": []
},
"rbaPolicy": {
"riskPolicyLocked": false
},
"oldestDataUpdated": "2014-12-04T22:19:42.894Z",
"oldestDataUpdatedTimestamp": 1417731582894,
"registered": "2014-12-04T22:19:42.956Z",
"regSource": "",
"socialProviders": "facebook"
}
]
As mentioned by juanpa.arrivillaga, I simply need to add the following line after the key_list:
json_list = [{k:d[k] for k in key_list} for d in json_list]
This is the full working code:
import json
import pandas
from flatten_json import flatten

# Enter the path to the JSON and the filename without appending '.json'
file_path = r'C:\Path\To\file_name'

# Open and load the JSON file; the context manager closes the handle once
# parsing is done instead of leaving it open.
with open(file_path + '.json', 'r', encoding='utf-8', errors='ignore') as json_file:
    json_list = json.load(json_file)

# Extract data from the defined key names.
# NOTE(review): d[k] assumes every record contains all three keys — confirm
# against the input data.
key_list = ['created', 'emails', 'identities']
json_list = [{k:d[k] for k in key_list} for d in json_list]

# Flatten and convert to a data frame
json_list_flattened = (flatten(d, '.') for d in json_list)
df = pandas.DataFrame(json_list_flattened)

# Export to CSV in the same directory with the original file name
export_csv = df.to_csv (file_path + r'.csv', sep=',', encoding='utf-8', index=None, header=True)
I want to process multiple JSON records one after the other. My code reads the JSON documents and stores them in a dataframe. Now I want to process the JSON documents row by row from the dataframe. When I take a row from the dataframe, I need to convert that single row to a dataframe again and do some operations on it. I am stuck at converting a class 'pyspark.sql.types.Row' object to a dataframe.
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri","mongodb://127.0.0.1/mydatabase.sample").load()
A = u(funcRowIter,df.schema)
z = df.withColumn("new_column",A(struct([df[x] for x in df.columns])))
z.show()
def funcRowIter(rows):
print type(rows)
if(rows is not None):
rdf = sqlContext.createDataFrame(rows)
rdf.show()
return rows
Help me out to convert the class 'pyspark.sql.types.Row' object to dataframe. My row object is huge json file.
This is the json i am trying to read from mongodb
{
"Feed": {
"feedBody": {
"Reservation": {
"recordLocatorID": "X23344",
"pnrCreateDate": "2018-09-24T23:00:00.000",
"lastUpdateTimestamp": "2018-09-26T14:51:01.643",
"pnrReservationSystemSequenceID": "1643",
"pnrPurgeDate": "2018-10-11",
"passengerCount": "1",
"reservationSystemCode": "1X",
"passengerList": {
"passenger": {
"passengerID": "2",
"lastUpdateTimestamp": "2018-09-24T18:00:54.835",
"dateOfBirth": "1993-10-02",
"givenName": "fgdfg",
"surName": "fgdfg",
"gender": "M",
"infantIndicator": "true",
"seatCount": "1",
"reservationSystemCustomerID": "dfgdfg",
"passengerTypeCode": "dfgfd",
"groupDepositIndicator": "false",
"passengerTicketDocList": {
"passengerTicketDoc": {
"ticketDocID": "45",
"lastUpdateTimestamp": "2018-09-24T18:01:01.149",
"ticketNumber": "43434343434",
"ticketType": "T",
"ticketIndicator": "E",
"status": "T",
"issuanceDate": "2010-09-20",
"chargeAmount": "0.74",
"currency": "USD"
}
}
}
}
}
}
}
}
This is the rows output
Row(Feed=Row(
feedBody=Row(
Reservation=Row(
recordLocatorID=u'X23344',
pnrCreateDate=u'2018-09-24T23:00:00.000',
lastUpdateTimestamp=u'2018-09-26T14:51:01.643',
pnrReservationSystemSequenceID=u'1643',
pnrPurgeDate=u'2018-10-11',
passengerCount=u'1',
reservationSystemCode=u'1X',
passengerList=Row(
passenger=Row(
passengerID=u'2',
lastUpdateTimestamp=u'2018-09-24T18:00:54.835',
dateOfBirth=u'1993-10-02',
givenName=u'fgdfg',
surName=u'fgdfg',
gender=u'M',
infantIndicator=u'true',
seatCount=u'1',
reservationSystemCustomerID=u'dfgdfg',
passengerTypeCode=u'dfgfd',
groupDepositIndicator=u'false',
passengerTicketDocList=Row(
passengerTicketDoc=Row(
ticketDocID=u'45',
lastUpdateTimestamp=u'2018-09-24T18:01:01.149',
ticketNumber=u'43434343434',
ticketType=u'T',
ticketIndicator=u'E',
status=u'T',
issuanceDate=u'2010-09-20',
chargeAmount=u'0.74',
currency=u'USD'))))))), _id=Row(oid=u'5bc0cc8c2ec34dd42a44fc2f'))