Why is InfluxDB performance so slow - Python

I am storing some data in InfluxDB, and it is quite confusing that Influx is 4-5 times slower than MySQL. I tried to test by inserting 10000 rows into MySQL and then into InfluxDB, and the stats are below.
Mysql
real: 6m 39sec
user: 2.956sec
sys: 0.504sec
Influxdb
real: 6m 17.193sec
user: 11.860sec
sys: 0.328sec
My code for Influx is given below; I used the same pattern to store into MySQL.
#!/usr/bin/env python
# coding: utf-8
import time
import csv
import sys
import datetime
import calendar
import pytz
from influxdb import client as influxdb
from datetime import datetime

host = 'localhost'
port = 8086
user = "admin"
password = "admin"
db_name = "testdatabase"
db = influxdb.InfluxDBClient(database=db_name)

def read_data():
    with open(file) as f:
        reader = f.readlines()[4:]
        for line in reader:
            yield (line.strip().split(','))

fmt = '%Y-%m-%d %H:%M:%S'
file = '/home/rob/mycsvfile.csv'

csvToInflux = read_data()
body = []
for metric in csvToInflux:
    timestamp = datetime.strptime(metric[0][1: len(metric[0]) - 1], fmt)
    new_value = float(metric[1])
    body.append({
        'measurement': 'mytable1',
        'time': timestamp,
        'fields': {
            'col1': metric[1],
            'col2': metric[2],
            'col3': metric[3],
            'col4': metric[4],
            'col5': metric[5],
            'col6': metric[6],
            'col7': metric[7],
            'col8': metric[8],
            'col9': metric[9]
        }
    })
    db.write_points(body)
Can someone give me an idea how I can improve it? I think it might be due to caching. Is the cache option off by default in InfluxDB? And can someone guide me on how to do batch processing in Influx? I tried looking over SO and Google but couldn't solve my problem. I am a newbie to InfluxDB and am trying to make it faster.
Thanks for any help or tips.

Inserting one by one into InfluxDB is slow; you should do it in batches. For example, trying with a CSV of 10000 lines (one by one):
with open('/tmp/blah.csv') as f:
    lines = f.readlines()

import influxdb

inf = influxdb.InfluxDBClient('localhost', 8086, 'root', 'root', 'example1')

for line in lines:
    parts = line.split(',')
    json_body = [{
        'measurement': 'one_by_one',
        'time': parts[0],
        'fields': {
            'my_value': int(parts[1].strip())
        }
    }]
    inf.write_points(json_body)
This gives me a result of
└─ $ ▶ time python influx_one.py
real 1m43.655s
user 0m19.547s
sys 0m3.266s
And doing a small change to insert all the lines of the CSV in one go:
json_body = []
for line in lines:
    parts = line.split(',')
    json_body.append({
        'measurement': 'one_batch',
        'time': parts[0],
        'fields': {
            'my_value': int(parts[1].strip())
        }
    })

inf.write_points(json_body)
The result is much much better:
└─ $ ▶ time python influx_good.py
real 0m2.693s
user 0m1.797s
sys 0m0.734s
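If the CSV is too large to send comfortably in one request, you can also write the accumulated list in chunks. A minimal sketch, assuming the same inf client and json_body list as above (the chunk size of 5000 is arbitrary; recent versions of the influxdb Python client also accept a batch_size argument to write_points that does the same splitting for you):

CHUNK = 5000
for i in range(0, len(json_body), CHUNK):
    # each call sends at most CHUNK points
    inf.write_points(json_body[i:i + CHUNK])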

Related

How to write a dataframe to dynamodb using AWS Lambda

I have a Lambda function set up in AWS CloudFormation. The runtime is python3.8.
The purpose is to pull some weather data from an API and write it to DynamoDB once a day.
So far the Lambda test on AWS checks out, all green... but the function doesn't write any values to DynamoDB.
Is there an error in the indentation maybe?
Here is the code:
import boto3
import pyowm
import time
import json
import requests
from datetime import datetime, date, timedelta, timezone
import pandas as pd
from geopy.geocoders import Nominatim

def lambda_handler(event, context):
    api_key = "xxxxxxx"  # Enter your own API Key
    owm = pyowm.OWM(api_key)

    city = 'Berlin, DE'
    geolocator = Nominatim(user_agent='aerieous#myserver.com')
    location = geolocator.geocode(city)
    lat = location.latitude
    lon = location.longitude

    # set the date to pull the data from to yesterday
    # format = '2021-09-09 00:00:00'
    x = (datetime.now() - timedelta(days=1))
    d = x.isoformat(' ', 'seconds')

    # convert time to epoch
    p = '%Y-%m-%d %H:%M:%S'
    dt = int(time.mktime(time.strptime(d, p)))

    url = "https://api.openweathermap.org/data/2.5/onecall/timemachine?lat=%s&lon=%s& dt=%s&appid=%s&units=metric" % (lat, lon, dt, api_key)
    response = requests.get(url)
    data_history = json.loads(response.text)

    # here we flatten only the nested list "hourly"
    df_history2 = pd.json_normalize(data_history, record_path='hourly', meta=['lat', 'lon', 'timezone'],
                                    errors='ignore')

    # convert epoch to timestamp
    df_history2['dt'] = pd.to_datetime(df_history2['dt'], unit='s').dt.strftime("%m/%d/%Y %H:%M:%S")

    # replace the column header
    df_history2 = df_history2.rename(columns={'dt': 'timestamp'})
    df_history2['uuid'] = df_history2[['timestamp', 'timezone']].agg('-'.join, axis=1)

    df_select_hist2 = df_history2[['uuid', 'lat', 'lon', 'timezone', 'timestamp', 'temp', 'feels_like', 'humidity', 'pressure']]
    df_select_hist2 = df_select_hist2.astype(str)
    df_select_hist2

    content = df_select_hist2.to_dict('records')
    return content

   dynamodb = boto3.resource(
       'dynamodb',
       aws_access_key_id='xx',
       aws_secret_access_key='xx',
       region_name='eu-west-1')

   table = dynamodb.Table("Dev_Weather")
   for item in content:
       uuid = item['uuid']
       timezone = item['timezone']
       timestamp = item['timestamp']
       lat = item['lat']
       lon = item['lon']
       temp = item['temp']
       feels_like = item['feels_like']
       humidity = item['humidity']
       pressure = item['pressure']

       table.put_item(
           Item={
               'pk_id': uuid,
               'sk': timestamp,
               'gsi_1_pk': lat,
               'gsi_1_sk': lon,
               'gsi_2_pk': temp,
               'gsi_2_sk': feels_like,
               'humidity': humidity,
               'pressure': pressure,
               'timezone': timezone
           }
       )
Thank you for any help in advance.
The line return content ends your lambda function. It basically tells the script: I'm done and this is the result. Nothing after it is executed. Remove the line to be able to execute code afterwards. Also, the indentation in your code example seems off (a space too little when starting the dynamodb stuff), so I'm a bit confused over why this wouldn't give syntax errors.
Also: there is no need to specify an access key, region etc. when creating the dynamodb resource. It's fetched by lambda automatically. Just make sure the lambda role has the right permissions to call dynamodb.
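A minimal sketch of the handler restructured along those two suggestions (same variable names and put_item call as in the question; the DynamoDB writes happen before the return, and the resource is created without explicit credentials):

def lambda_handler(event, context):
    # ... build df_select_hist2 exactly as in the question ...
    content = df_select_hist2.to_dict('records')

    # credentials and region come from the Lambda execution role / environment
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table("Dev_Weather")

    for item in content:
        table.put_item(
            Item={
                'pk_id': item['uuid'],
                'sk': item['timestamp'],
                'gsi_1_pk': item['lat'],
                'gsi_1_sk': item['lon'],
                'gsi_2_pk': item['temp'],
                'gsi_2_sk': item['feels_like'],
                'humidity': item['humidity'],
                'pressure': item['pressure'],
                'timezone': item['timezone']
            }
        )

    # return only after the writes have happened
    return content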

Get all Salesforce Lead data in Python

I want to get all the data of the Lead object, and I wrote this script to obtain the Lead data, but I only get 2000 records.
What do I have to do to get all the records of an object in Salesforce with Python?
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json
import account

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'

def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead columns to be acquired
    columns = [
        "Id",
        "Email",
        "Company",
        "Address",
        "Phone",
        "FirstName",
        "LastName",
        "CreatedDate",
    ]
    sosl = 'SELECT {0[0]}, {0[1]}, {0[2]}, {0[3]}, {0[4]}, {0[5]}, {0[6]}, {0[7]} FROM Lead '.format(
        columns)

    # Data acquisition
    data = sf.query(sosl)

    # Delete CSV file if it exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="") as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    data = json.loads(json.dumps(d))
                    del data['attributes']
                    d = datetime.strptime(
                        data['CreatedDate'], '%Y-%m-%dT%H:%M:%S.%f%z')
                    data['CreatedDate'] = d.strftime('%Y-%m-%d %H:%M:%S')
                    writer.writerow(data)

if __name__ == '__main__':
    main()
If anyone knows, please let me know.
You can obtain all responsive records via the query_all() or query_all_iter() methods, which are documented under Queries in the simple_salesforce documentation.
Note that the query you are running is SOQL, not SOSL. SOSL is for full-text search.
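For example, a minimal change to the question's code (keeping the query string stored in the sosl variable):

# query_all() follows Salesforce's pagination (nextRecordsUrl) for you
# and returns every matching record, not just the first 2000
data = sf.query_all(sosl)

# or, to avoid holding everything in memory at once, iterate lazily
for record in sf.query_all_iter(sosl):
    print(record['Id'])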

Transfer csv to elasticsearch from Python with document_id as csv field

I wanted to transfer the following CSV to Elasticsearch:
|hcode|hname|
|1|aaaa|
|2|bbbbb|
|3|ccccc|
|4|dddd|
|5|eeee|
|6|ffff|
and I need to insert the hcode field as the document_id. I am getting the error below:
File "C:\Users\Namali\Anaconda3\lib\site-packages\elasticsearch\connection\base.py", line 181, in _raise_error
status_code, error_message, additional_info
RequestError: RequestError(400, 'mapper_parsing_exception', 'failed to parse')"
The Elasticsearch version is 7.1.1 and the Python version is 3.7.6.
Python code:
import csv
import json
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

def csv_reader(file_obj, delimiter=','):
    reader_ = csv.reader(file_obj, delimiter=delimiter, quotechar='"')
    i = 1
    results = []
    for row in reader_:
        #try:
        #    es.index(index='hb_hotel_raw', doc_type='hb_hotel_raw', id=row[0],
        #             body=json.dump([row for row in reader_], file_obj))
        es.index(index='test', doc_type='test', id=row[0], body=json.dumps(row))
        #except:
        #    print("error")
        i = i + 1
        results.append(row)
        print(row)

if __name__ == "__main__":
    with open("D:\\namali\\rez\\data_mapping\\test.csv") as f_obj:
        csv_reader(f_obj)
First, doc_type should be omitted in Elasticsearch 7. Second, you need to pass a valid JSON document (a dict, not a list) to Elasticsearch. I edited your code as below:
for row in reader_:
    _id = row[0].split("|")[1]
    text = row[0].split("|")[2]
    my_dict = {"hname": text}
    es.index(index='test', id=_id, body=my_dict)
<disclosure: I'm the developer of Eland and employed by Elastic>
If you're willing to load the CSV into a Pandas DataFrame you can use Eland to create/append the tabular data to an Elasticsearch index with all data types resolved properly.
I would recommend reading pandas.read_csv() and eland.pandas_to_eland() function documentation for ideas on how to accomplish this.
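A rough sketch of that route, under the assumption that the CSV parses into plain hcode/hname columns and that hcode should become the document _id (exact argument names may differ between Eland versions):

import pandas as pd
import eland as ed
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# assumption: the file has been cleaned into plain hcode,hname columns
pd_df = pd.read_csv("D:\\namali\\rez\\data_mapping\\test.csv")
pd_df = pd_df.set_index('hcode')  # the DataFrame index is used as the Elasticsearch _id

ed.pandas_to_eland(
    pd_df,
    es_client=es,
    es_dest_index="test",
    es_if_exists="replace",
    use_pandas_index_for_es_ids=True,
)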

How do I create a CSV in Lambda using Python?

I would like to create a report in Lambda using Python that is saved to a CSV file. Below is the code of the function:
import boto3
import datetime
import re

def lambda_handler(event, context):
    client = boto3.client('ce')
    now = datetime.datetime.utcnow()
    end = datetime.datetime(year=now.year, month=now.month, day=1)
    start = end - datetime.timedelta(days=1)
    start = datetime.datetime(year=start.year, month=start.month, day=1)
    start = start.strftime('%Y-%m-%d')
    end = end.strftime('%Y-%m-%d')
    response = client.get_cost_and_usage(
        TimePeriod={
            'Start': "2019-02-01",
            'End': "2019-08-01"
        },
        Granularity='MONTHLY',
        Metrics=['BlendedCost'],
        GroupBy=[
            {
                'Type': 'TAG',
                'Key': 'Project'
            },
        ]
    )
How can I create a CSV file from it?
Here is a sample function to create a CSV file in Lambda using Python:
Assuming that the variable 'response' has the required data for creating the report for you, the following piece of code will help you create a temporary CSV file in the /tmp folder of the lambda function:
import csv

temp_csv_file = csv.writer(open("/tmp/csv_file.csv", "w+"))

# writing the column names
temp_csv_file.writerow(["Account Name", "Month", "Cost"])

# writing rows in to the CSV file
for detail in response:
    temp_csv_file.writerow([detail['account_name'],
                            detail['month'],
                            detail['cost']
                            ])
Once you have created the CSV file, you can upload it to S3 and send it as an email or share it as a link using the following piece of code:
client = boto3.client('s3')
client.upload_file('/tmp/csv_file.csv', BUCKET_NAME,'final_report.csv')
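To share it as a link specifically, one option (not shown in the original answer) is an S3 presigned URL, for example:

# generate a time-limited download link for the uploaded report
url = client.generate_presigned_url(
    'get_object',
    Params={'Bucket': BUCKET_NAME, 'Key': 'final_report.csv'},
    ExpiresIn=3600  # link valid for one hour
)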
Points to remember:
/tmp is directory storage of 512 MB which can be used to store a few temporary files.
You should not rely on this storage to maintain state across subsequent Lambda invocations.
The above answer by Repakula Srushith is correct but will create an empty CSV, as the file is not being closed. You can change the code to:
f = open("/tmp/csv_file.csv", "w+")
temp_csv_file = csv.writer(f)
temp_csv_file.writerow(["Account Name", "Month", "Cost"])

# writing rows in to the CSV file
for detail in response:
    temp_csv_file.writerow([detail['account_name'],
                            detail['month'],
                            detail['cost']
                            ])

f.close()
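Alternatively, a with block closes the file automatically when the block is left, which avoids the problem entirely:

with open("/tmp/csv_file.csv", "w+", newline="") as f:
    temp_csv_file = csv.writer(f)
    temp_csv_file.writerow(["Account Name", "Month", "Cost"])
    for detail in response:
        temp_csv_file.writerow([detail['account_name'],
                                detail['month'],
                                detail['cost']])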

Appending data to previous row of a CSV using Python

I'm working on a Python script that takes Nessus data exported as CSV and removes duplicate data. However, due to the way the export works, results for different ports and protocols get their own row even though all the other data in the row is the same. I need to remove these duplicates, but I want to keep the Port and Protocol column data and append it to the previous row.
Here is a very small CSV I'm using to test and build the script:
Screenshot of CSV File
As you can see, all fields are exactly the same apart from the port field (and sometimes the protocol field will be different too), so I need to read both rows of the CSV file and then append the ports like this: 80, 443, and the same with the protocol: tcp, tcp.
Then only one line should be saved, removing the duplicate data. I have tried doing this by checking whether there has already been an instance of the Plugin ID, but my output only prints the second row's Port and Protocol.
protocollist = []
portlist = []
pluginid_list = []
multiple = False

with open(csv_file_input, 'rb') as csvfile:
    nessusreader = csv.DictReader(csvfile)
    for row in nessusreader:
        pluginid = row['Plugin ID']
        if pluginid != '':
            pluginid_list.append(row['Plugin ID'])
            print(pluginid_list)
            count = pluginid_list.count(pluginid)
            cve = row['CVE']
            if count > 0:
                protocollist.append(row['Protocol'])
                print(protocollist)
                portlist.append(row['Port'])
                print(portlist)
                print('Counted more than 1')
                multiple = True
            if multiple == True:
                stringlist = ', '.join(protocollist)
                newstring1 = stringlist
                protocol = newstring1
                stringlist2 = ', '.join(portlist)
                newstring2 = stringlist2
                port = newstring2
            else:
                protocol = row['Protocol']
                port = row['Port']
            cvss = row['CVSS']
            risk = row['Risk']
            host = row['Host']
            name = row['Name']
            synopsis = row['Synopsis']
            description = row['Description']
            solution = row['Solution']
            seealso = row['See Also']
            pluginoutput = row['Plugin Output']

with open(csv_file_output, 'w') as csvfile:
    fieldnames = ['Plugin ID', 'CVE', 'CVSS', 'Risk', 'Host', 'Protocol', 'Port', 'Name', 'Synopsis', 'Description', 'Solution', 'See Also', 'Plugin Output']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'Plugin ID': pluginid, 'CVE': cve, 'CVSS': cvss, 'Risk': risk, 'Host': host, 'Protocol': protocol, 'Port': port, 'Name': name, 'Synopsis': synopsis, 'Description': description, 'Solution': solution, 'See Also': seealso, 'Plugin Output': pluginoutput})
There are probably a few errors in the code, as I've been trying different things, but I just wanted to show the code I've been working on to give more context to the issue. This code works when the data is only as shown in the CSV, since there are only two items; however, when I introduced a third set of data with a different Plugin ID, it was added to the list as well, probably because the if statement checks for a count > 0.
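A minimal sketch of the merging approach described above, assuming the same column names and the csv_file_input/csv_file_output variables from the question (grouping rows by Plugin ID and joining the Port/Protocol values of duplicates):

import csv
from collections import OrderedDict

merged = OrderedDict()
with open(csv_file_input) as csvfile:
    for row in csv.DictReader(csvfile):
        key = row['Plugin ID']
        if key in merged:
            # duplicate Plugin ID: append this row's Port/Protocol to the row already kept
            merged[key]['Port'] += ', ' + row['Port']
            merged[key]['Protocol'] += ', ' + row['Protocol']
        else:
            merged[key] = row

with open(csv_file_output, 'w', newline='') as csvfile:
    fieldnames = ['Plugin ID', 'CVE', 'CVSS', 'Risk', 'Host', 'Protocol', 'Port', 'Name', 'Synopsis', 'Description', 'Solution', 'See Also', 'Plugin Output']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(merged.values())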
