Getting InvalidBSON error in pymongo code. How to solve this?

import pymongo
import numpy as np
import pandas as pd
from datetime import datetime
from pymongo import MongoClient
from bson.objectid import ObjectId

test = MongoClient("mongodb://harikrishna:ch9u2encn4e9hc3n0284hci-032n8c4#35.200.240.21:27017/tb_dev")
testdb = test.tb_dev

submission = testdb.test_submission.aggregate([
    {"$match": {"createdOn": {"$gte": datetime.strptime("2020-11-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
                              "$lt": datetime.strptime("2021-01-01T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z")},
                "isQuiz": False}},
    {"$project": {"sid": 1, "tid": 1, "marks": 1,
                  "timeTaken": {"$divide": ["$timeTaken", 60]},
                  "totalAnswered": 1}}
], allowDiskUse=True)
submission = pd.DataFrame(list(submission))
submission.head()
I am trying to execute this code, but I get an InvalidBSON error on the submission = pd.DataFrame(list(submission)) line. Help me decode this error.
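No accepted answer is recorded here. InvalidBSON is raised while the driver decodes documents returned by the server, commonly because a stored field (often a date written by another driver or application) is outside what Python can represent. A diagnostic sketch, under the assumption that fetching one document per batch lets the loop skip the offending record instead of failing the whole batch:

from bson.errors import InvalidBSON

pipeline = [{"$match": {"isQuiz": False}}]  # substitute the full $match/$project pipeline from above

# batchSize=1 makes the driver decode one document per fetch.
cursor = testdb.test_submission.aggregate(pipeline, allowDiskUse=True, batchSize=1)

docs = []
while True:
    try:
        docs.append(next(cursor))
    except StopIteration:
        break
    except InvalidBSON as exc:
        print("Skipping undecodable document:", exc)

submission = pd.DataFrame(docs)

If the culprit is an out-of-range date, recent PyMongo releases (4.3+) also offer a datetime_conversion codec option that can decode such values without raising.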

Related

profile = report(data) TypeError: 'module' object is not callable

import profile
import pandas as pd
from pandas_profiling import profile_report as report
data = pd.read_csv(r"ifsc-code-of-syndicate-bank-_All-India_.csv")
print(data)
#generate a report
profile = report(data)
profile.to_file(output_file="ifsc.html")
As per the official documentation of this package, you should use this import instead:
from pandas_profiling import ProfileReport as report
https://pandas-profiling.ydata.ai/docs/master/pages/getting_started/quickstart.html
In that package, profile_report is a module (the one containing the ProfileReport class), so importing the module and calling report(data) is exactly what raises 'module' object is not callable.
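For completeness, a minimal corrected version of the script, keeping the question's file names and dropping the unused import profile:

import pandas as pd
from pandas_profiling import ProfileReport

data = pd.read_csv(r"ifsc-code-of-syndicate-bank-_All-India_.csv")
print(data)

# generate the report and write it to an HTML file
profile = ProfileReport(data)
profile.to_file(output_file="ifsc.html")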

AWS ParamValidationError: Parameter validation failed: Invalid type for parameter Body, value:

I am writing code in AWS Lambda and I cannot pass data like
[0,1,0,0,0,1.....]
It fails with a parameter validation error. Any suggestions on how to pass the data?
import json
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import os
import io
import boto3
import csv
import numpy as np
from preprocessing import preproprocess

ENDPOINT_NAME = os.environ['ENDPOINT_NAME']
runtime = boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    data = json.loads(json.dumps(event))
    payload = data['data']
    print(payload)
    # any_text = 'Hi sarah wondering what is the status of my order thanks soham'
    # order_status = 'Awaiting shipment'
    # eg = preproprocess(any_text, order_status)
    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/csv',
                                       Body=payload)
Here is the code for my model:
%%writefile train.py
import pip
from pip._internal import main

def install(package):
    """Install packages."""
    if hasattr(pip, 'main'):
        pip.main(['install', package])
    else:
        main(['install', package])

install('s3fs')

import os
import boto3
import pandas as pd
from sklearn.svm import SVC
from sklearn.externals import joblib
import s3fs

bucket_name = 'testsvc'

if __name__ == "__main__":
    training_data_directory_path = "s3://testsvc/xgboost-as-a-built-in-algo/train/train.csv"
    train_data = pd.read_csv(training_data_directory_path)
    X_train = train_data.drop(columns=['tags'])
    y_train = train_data['tags']
    svc = SVC(kernel='rbf', C=10, gamma=0.1)
    svc.fit(X_train, y_train)
    model_output_directory = os.path.join("/opt/ml/model", "model.joblib")
    # boto3.Session().resource('s3').Bucket(bucket_name).Object(model_output_directory).upload_file('')
    print("Saving model to {}".format(model_output_directory))
    joblib.dump(svc, model_output_directory)

def model_fn(model_dir):
    """Deserialize and return the fitted model.

    Note that this should have the same name as the serialized model in the main method.
    """
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
When I pass JSON data like "0,1,0,0,...." in a test, I get an error saying my data is not a 2-D array but a 1-D array and should be reshaped. When I try to change the formatting of the test data, I get the parameter validation error instead. Please help me find a way around this.
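No answer is recorded for this one. The ParamValidationError happens because boto3's invoke_endpoint expects Body to be bytes, a string, or a file-like object, and a raw Python list is rejected before the request is even sent. A minimal sketch of a workaround, assuming the endpoint expects one sample per CSV line and returns JSON:

import json
import os
import boto3

runtime = boto3.client('runtime.sagemaker')
ENDPOINT_NAME = os.environ['ENDPOINT_NAME']

def lambda_handler(event, context):
    payload = event['data']  # e.g. [0, 1, 0, 0, 0, 1, ...]
    # Serialize the feature list into a single CSV row, since Body
    # must be bytes/str/file-like rather than a Python list.
    body = ",".join(str(x) for x in payload)
    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/csv',
                                       Body=body)
    # Assumes the endpoint returns a JSON document.
    return json.loads(response['Body'].read().decode('utf-8'))

The separate "expected 2D array, got 1D array" message comes from scikit-learn: predict() wants shape (1, n_features) for a single sample, so the serving side (for example a custom input_fn) has to reshape a single CSV row accordingly.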

Error: column is of type timestamp without time zone but expression is of type double precision

The following code...
import time
import os
import re
import csv
import glob
import runpy
import codecs
import collections
import threading
import warnings
import numpy as np
import pandas as pd
import pyodbc
import psycopg2
import pythoncom
import xlsxwriter
import win32com.client
from win32com import client
from threading import Timer
from multiprocessing import Process, Queue
from sqlalchemy import create_engine, event
from datetime import datetime, timezone, timedelta
# from docx import Document
# from docx.shared import Inches, Pt, Mm

warnings.filterwarnings("ignore")
kronos_df = pd.read_excel(r"\\teslamotors.com\us\Public\stalamakki\ExcelFiles\KronosDataHourlyRefresh.xls")
kronos_df.fillna('')
clockRecords = kronos_df.to_dict('records')

sqlUpsert = """
INSERT INTO "daily_performance_metrics"."employee_kronos_data_2"
VALUES (%s,%s,%s,%s,%s)
"""
# VALUES (%s,to_timestamp(%s, 'YY-MM-DD HH24:MI'),COALESCE(to_timestamp(NULLIF(%s, '01/01/01 00:00'),'MM/DD/YY hh24:mi')),%s,%s)
# sqlDelete = """
#     DELETE FROM "daily_performance_metrics"."employee_kronos_data" WHERE CustomerName='Alfreds Futterkiste';

postgres_conn = psycopg2.connect("host=sjc04p1scadb02.teslamotors.com dbname=service_warehouse_metrics user=service_warehouse_rw port=5432 password=gvjY96LcnWn2B3+obVjFsLG5erMy/4JNxgN00Lnq2n0=")
postgres_cursor = postgres_conn.cursor()

for record in clockRecords:
    if record['ShiftEnd'] == '':
        record['ShiftEnd'] = None
    if record['ShiftStart'] == '':
        record['ShiftStart'] = None
    postgres_cursor.execute(sqlUpsert, list(record.values()))

postgres_conn.commit()
postgres_cursor.close()
postgres_conn.close()
...generates this error message when it tries to write what I assume is the first record with a null value...
---------------------------------------------------------------------------
DatatypeMismatch Traceback (most recent call last)
<ipython-input-73-2ef7c8c3820c> in <module>()
15 if record['ShiftStart'] == 'NaN':
16 record['ShiftStart'] = None
---> 17 postgres_cursor.execute(sqlUpsert,list(record.values()))
18 postgres_conn.commit()
19 postgres_cursor.close()
DatatypeMismatch: column "shift_end" is of type timestamp without time zone but expression is of type double precision
LINE 3: ... VALUES ('zvolkert','10/02/19 13:13','NaN'::flo...
^
HINT: You will need to rewrite or cast the expression.
To deal with the Nulls, I've tried this syntax for the INSERT statement...
INSERT INTO "daily_performance_metrics"."employee_kronos_data_2"
VALUES (%s,to_timestamp(%s, 'YY-MM-DD HH24:MI'),COALESCE(to_timestamp(NULLIF(%s, '01/01/01 00:00'),'MM/DD/YY hh24:mi')),%s,%s)
...which generates this error message...
InvalidTextRepresentation: invalid input syntax for type double precision: "01/01/01 00:00"
LINE 3: ...4:MI'),COALESCE(to_timestamp(NULLIF('NaN'::float, '01/01/01 ...
I'm assuming this is a very simple syntax mistake. I would really appreciate it if someone could tell me the correct syntax for getting these strings and null values into the timestamp fields.
I'm writing to this table...
CREATE TABLE daily_performance_metrics.employee_kronos_data_5 (
file_number TEXT
,shift_start TIMESTAMP
,shift_end TIMESTAMP
,job_category TEXT
,job_name TEXT
)
Here's the file that I'm trying to copy to the database:
ClockInOutRecords.xlsx
The solution was simply to replace the NaN values with None. I thought I was already doing that, but the == 'NaN' comparison was incorrect: a missing value is a float NaN, not the string 'NaN', so that check never matches. The correct way to test for NaN is pd.isna(). Fixed code below...
sqlUpsert = """
INSERT INTO "daily_performance_metrics"."employee_kronos_data_2"
VALUES (%s,COALESCE(to_timestamp(%s, 'MM/DD/YY hh24:mi')),COALESCE(to_timestamp(%s, 'MM/DD/YY hh24:mi')),%s,%s)
"""
# VALUES (%s,%s,%s,%s,%s)
# VALUES (%s,to_timestamp(%s, 'YY-MM-DD HH24:MI'),COALESCE(to_timestamp(NULLIF(%s, '01/01/01 00:00'),'MM/DD/YY hh24:mi')),%s,%s)
postgres_conn = psycopg2.connect("host=sjc04p1scadb02.teslamotors.com dbname=service_warehouse_metrics user=service_warehouse_rw port=5432 password=gvjY96LcnWn2B3+obVjFsLG5erMy/4JNxgN00Lnq2n0=")
postgres_cursor = postgres_conn.cursor()
postgres_cursor.execute(sqlDelete)
for record in clockRecords:
# print(list(record.values()))
if pd.isna(record['ShiftEnd']):
# print(record['ShiftEnd'])
record['ShiftEnd'] = None
# print(record['ShiftEnd'])
if pd.isna(record['Job_Category']):
record['Job_Category'] = None
if pd.isna(record['Job_Name']):
record['Job_Name'] = None
# print(list(record.values()))
postgres_cursor.execute(sqlUpsert,list(record.values()))
# print('success')
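As a quick illustration of why the original == 'NaN' check never matched: pandas represents missing cells as float NaN, which compares unequal to everything, including itself:

import pandas as pd

x = float("nan")     # what a missing Excel cell becomes in the DataFrame
print(x == "NaN")    # False: NaN is a float, not the string "NaN"
print(x == x)        # False: NaN is not even equal to itself
print(pd.isna(x))    # True: the reliable check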

CQLSH COPY ERROR TypeError: 'int' object is not iterable

I have the code below to dump table data, but it fails with TypeError: 'int' object is not iterable.
import argparse
import sys
import itertools
import codecs
import uuid
import os

try:
    import cassandra
    import cassandra.concurrent
except ImportError:
    sys.exit('Python Cassandra driver not installed. You might try "pip install cassandra-driver".')

from cassandra.cluster import Cluster, ResultSet
from cassandra.policies import DCAwareRoundRobinPolicy
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import ConsistencyLevel

datafile = "/Users/username/adf.csv"
if os.path.exists(datafile):
    os.remove(datafile)

def dumptableascsv():
    auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
    cluster = Cluster(['127.0.0.1'],
                      load_balancing_policy=DCAwareRoundRobinPolicy(local_dc='Cassandra'),
                      port=9042, auth_provider=auth_provider)
    session = cluster.connect('qualys_ioc')
    session.execute("COPY qualys_ioc.agent_delta_fragment(agent_id , delta_id , fragment_id, boolean ,created_on) TO "
                    "'/Users/username/adf.csv' WITH HEADER = true ;", ConsistencyLevel.QUORUM)

dumptableascsv()
COPY is a cqlsh command -- it is not part of CQL, and cannot be executed via native protocol clients. You are getting this particular error, rather than a server-side request error, because you are passing a consistency level in the parameters position of Session.execute.
You can use cqlsh to do this from a script, or have a look at the DataStax Bulk Loader (DSBulk) for high-performance loading and unloading.
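A minimal sketch of the first suggestion, reusing the host, port, and credentials from the question and assuming cqlsh is on the PATH:

import subprocess

# Run the COPY through cqlsh itself, which implements the command client-side.
subprocess.run([
    "cqlsh", "127.0.0.1", "9042",
    "-u", "cassandra", "-p", "cassandra",
    "-e", "COPY qualys_ioc.agent_delta_fragment TO '/Users/username/adf.csv' WITH HEADER = true;",
], check=True)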

AttributeError: type object 'MinimalFeatureExtractionSettings' has no attribute 'n_processes'

I'm trying to extract features using the tsfresh package and its extract_features() function.
tsfresh version: 0.4.0.post0.dev1+ng19fa136
However, I get the following error:
AttributeError: type object 'MinimalFeatureExtractionSettings' has no attribute 'n_processes'
Code:
import numpy as np
import pandas as pd

column_names = ['time_series1', 'time_series2', 'time_series3']
ts = np.random.rand(6, 3)
df_to_extract = pd.DataFrame(data=ts, columns=column_names)
df_to_extract['id'] = 1
df_to_extract['time'] = np.arange(1, 7)
# print(df_to_extract)

import tsfresh
from tsfresh import extract_features
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction import extract_features, MinimalFeatureExtractionSettings
from tsfresh.feature_extraction.settings import *
from tsfresh.feature_extraction.settings import FeatureExtractionSettings
import tsfresh.feature_extraction.settings
from tsfresh import utilities
from tsfresh import feature_extraction

extracted_features = extract_features(df_to_extract,
                                      column_id="id",
                                      column_sort="time",
                                      parallelization='per_kind',
                                      feature_extraction_settings=MinimalFeatureExtractionSettings)
Package source code: https://github.com/blue-yonder/tsfresh/blob/master/tsfresh/feature_extraction/extraction.py
I'm using Python 3.5 (Anaconda) on Win10.
I suppose it could be some kind of import error.
How to solve that issue?
Problem solved.
To make it work, pass an instance of the settings class rather than the class itself:
settings = MinimalFeatureExtractionSettings()
extracted_features = extract_features(df_to_extract,
                                      column_id="id",
                                      column_sort="time",
                                      parallelization='per_kind',
                                      feature_extraction_settings=settings)
There is no MinimalFeatureExtractionSettings object anymore; it is called MinimalFCParameters now. Thus, you would have to write the following code:
from tsfresh.feature_extraction import extract_features, MinimalFCParameters
...
minimalFCParametersForTsFresh = MinimalFCParameters()
extracted_features = extract_features(df_to_extract,
                                      column_id="id",
                                      default_fc_parameters=minimalFCParametersForTsFresh)
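Putting the renamed API together with the question's toy data, a self-contained version (assuming a current tsfresh release) looks like this:

import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# Toy data from the question: three random series sharing one id and a time axis.
column_names = ['time_series1', 'time_series2', 'time_series3']
df_to_extract = pd.DataFrame(data=np.random.rand(6, 3), columns=column_names)
df_to_extract['id'] = 1
df_to_extract['time'] = np.arange(1, 7)

# MinimalFCParameters computes only a small, fast set of features.
extracted_features = extract_features(df_to_extract,
                                      column_id="id",
                                      column_sort="time",
                                      default_fc_parameters=MinimalFCParameters())
print(extracted_features)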
