gremlin id column extraction - GLUE - python

I am trying to extract Neptune database vertices into a CSV file, but the extraction fails at the id column. Below is the script I am trying to run in the AWS Glue console.
import boto3
import os
import sys
import site
import json
import pandas as pd
from setuptools.command import easy_install
from importlib import reload
s3 = boto3.client('s3')
dir_path = os.path.dirname(os.path.realpath(__file__))
#os.path.dirname(sys.modules['__main__'].__file__)
install_path = os.environ['GLUE_INSTALLATION']
easy_install.main( ["--install-dir", install_path, "gremlinpython"] )
reload(site)
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.process.traversal import T
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
graph = Graph()
remoteConn = DriverRemoteConnection('wss://neptune-test-new-reader-1.c3nqs7vjaggx.eu-west-1.neptune.amazonaws.com:8182/gremlin','g')
g = graph.traversal().withRemote(remoteConn)
vertices_columns = ['id', 'label', 'region','country']
vertices = g.V().hasLabel('airport').limit(2).project('id','label','region','country').by('T.id').by('T.label').by('region').by('country').select(values).fold()
for v in vertices:
    print(v)
Error:
NameError: name 'values' is not defined
I then tried the script below with a for loop:
import boto3
import os
import sys
import site
import json
import pandas as pd
from setuptools.command import easy_install
from importlib import reload
s3 = boto3.client('s3')
dir_path = os.path.dirname(os.path.realpath(__file__))
#os.path.dirname(sys.modules['__main__'].__file__)
install_path = os.environ['GLUE_INSTALLATION']
easy_install.main( ["--install-dir", install_path, "gremlinpython"] )
reload(site)
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.process.traversal import T
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
graph = Graph()
remoteConn = DriverRemoteConnection('wss://neptune-test-new-reader-1.c3nqs7vjaggx.eu-west-1.neptune.amazonaws.com:8182/gremlin','g')
g = graph.traversal().withRemote(remoteConn)
vertices_columns = ['id', 'label', 'region','country']
"""
vertices = g.V().hasLabel('airport').limit(2).project('id','label','region','country').by('T.id').by('T.label').by('region').by('country').select(values).fold()
for v in vertices:
    print(v)
"""
#vertices = []
vertices = g.V().limit(1).valueMap(True).toList()
for v in vertices:
    print(v)
    for col in vertices_columns:
        print(v[col])
#print(vertices)
Error:
Output of print(v):
{<T.id: 1>: '1', <T.label: 1>: 'airport', 'country': 'US', 'region': 'US-AL'}
KeyError: 'id'

The values keyword used in select(values) is a reference to an enum defined as part of Column. In your code you can use select(Column.values) and include the definition using:
from gremlin_python.process.traversal import Column
Here is a Python example:
>>> g.V('3').project('id','label','code').by(T.id).by(T.label).by('code').toList()
[{'id': '3', 'label': 'airport', 'code': 'AUS'}]
>>> g.V('3').project('id','label','code').by(T.id).by(T.label).by('code').select(Column.values).toList()
[['3', 'airport', 'AUS']]
Note that T.id and T.label are not inside quotes.
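Putting that together with the original goal of writing a CSV, here is a minimal sketch reusing the g and pd objects from the script above (the KeyError in the second script comes from valueMap(True) keying id and label by the T.id and T.label enums rather than the strings 'id' and 'label'; the output file name below is just a placeholder):
from gremlin_python.process.traversal import T, Column

# project with unquoted T.id / T.label, then keep only the values per row
rows = (g.V().hasLabel('airport').limit(2)
         .project('id', 'label', 'region', 'country')
         .by(T.id).by(T.label).by('region').by('country')
         .select(Column.values)
         .toList())

# each row is a plain list of values in the projected order
df = pd.DataFrame(rows, columns=['id', 'label', 'region', 'country'])
df.to_csv('airports.csv', index=False)  # placeholder output path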

Related

Why does SpatiaLite's knn2 find 0 points?

So I add a set of points and try to find them like so:
import os
from datetime import datetime
import json
from pprint import pprint
from playhouse.sqlite_ext import *
from peewee import *
from playhouse.kv import *
from playhouse.shortcuts import model_to_dict
from playhouse.reflection import generate_models, print_model, print_table_sql
from IPython.display import SVG
import folium
db = SqliteExtDatabase('./my_database.db')
db.connect()
db.connection().enable_load_extension(True)
print(db.execute_sql("SELECT load_extension('mod_spatialite.so');").fetchall())
print(db.execute_sql("SELECT InitSpatialMetaData(); ").fetchall())
print(db.execute_sql("SELECT sqlite_version(), spatialite_version();").fetchall())
#print(db.execute_sql("CREATE VIRTUAL TABLE knn2 USING VirtualKNN2();").fetchall())
class BaseModel(Model):
    class Meta:
        database = db

class GeometryField(Field):
    db_field = 'geometry'

def geometryFromWKT(wellknowntext):
    return fn.ST_GeomFromText(wellknowntext)
class Tbl(BaseModel):
geometry = GeometryField()
models = generate_models(db)
Tbl.create_table()
Tbl.insert(geometry=geometryFromWKT('POINT(4 9)')).execute()
Tbl.insert(geometry=geometryFromWKT('POINT(9 9)')).execute()
Tbl.insert(geometry=geometryFromWKT('POINT(14 9)')).execute()
Tbl.insert(geometry=geometryFromWKT('POINT(4 19)')).execute()
Tbl.insert(geometry=geometryFromWKT('POINT(41 9)')).execute()
Tbl.insert(geometry=geometryFromWKT('POINT(14 19)')).execute()
#https://www.gaia-gis.it/gaia-sins/spatialite-sql-4.4.0.html
cmd = """SELECT AsGeoJSON(ST_Union(CastToXY(geometry)), 5, 3) from Tbl"""
data_all = db.execute_sql(cmd).fetchall()[0][0]
print(data_all)
cmd = """SELECT * FROM knn2
WHERE f_table_name = 'Tbl' AND ref_geometry = MakePoint(5, 5) AND f_geometry_column = 'geometry' AND radius = 1000.0;"""
data_knn = db.execute_sql(cmd).fetchall()
print(data_knn)
One can try it out here (simple with knn) or here (complicated with knn2).
The code outputs:
[(None,)]
[(1,)]
[('3.22.0', '5.1.0-devel')]
{"type":"MultiPoint","bbox":[4,9,41,19],"coordinates":[[4,9],[4,19],[9,9],[14,9],[14,19],[41,9]]}
[]
So there are points around Point(5, 5) (one can see them on the map), yet none is found within a radius of 10 or 1000... Why, and how can I make knn2 work properly for points created from WKT in SQLite using SpatiaLite?
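One thing worth checking (an assumption, not a verified fix): SpatiaLite's KNN virtual tables resolve f_table_name / f_geometry_column through the geometry_columns metadata and the R*Tree spatial index, and the column created by the peewee model above is never registered or indexed. A minimal sketch of that registration step, reusing the db handle and names from the code above:
# register the column in geometry_columns so the KNN virtual table can find it
# (SRID 0 is an assumption; it must match the SRID GeomFromText actually stored,
#  which can be made explicit by passing an SRID as its second argument)
print(db.execute_sql("SELECT RecoverGeometryColumn('Tbl', 'geometry', 0, 'POINT', 'XY');").fetchall())
# build the R*Tree spatial index that the KNN search runs against
print(db.execute_sql("SELECT CreateSpatialIndex('Tbl', 'geometry');").fetchall())
# then re-run the knn2 query from above
print(db.execute_sql(cmd).fetchall())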

module 'robin_stocks' has no attribute 'get_current_positions'

This is my first time using the Robinhood API and I am trying out their documentation: https://readthedocs.org/projects/robin-stocks/downloads/pdf/latest/, but one of their key functions is not working. When I call robin_stocks.get_current_positions() I get the error:
AttributeError: module 'robin_stocks' has no attribute 'get_current_positions'
Here is my code:
import robin_stocks, json
from robin_stocks import *
import robin_stocks as r
import sys
import time
import requests
content = open('config.json').read()
config = json.loads(content)
login = r.login(config['username'],config['password'], store_session=True)
my_stocks = robin_stocks.build_holdings()
for key, value in my_stocks.items():
    mystocks = key, value
    print(mystocks)
    WEIbalance = mystocks[1]['equity']
    WEI = mystocks[0]
    print('YY', WEI)
positions_data = robin_stocks.get_current_positions()
print('my equity', WEIbalance)
print(positions_data)
Is this an error on my part?
As of this commit, get_current_positions was renamed to get_open_stock_positions(); the corresponding issue can be found here.
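So with a current version of the library, the call in the snippet above would be (keeping the same r alias as in the question):
import robin_stocks as r

# formerly robin_stocks.get_current_positions()
positions_data = r.get_open_stock_positions()
print(positions_data)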

Tableau to Athena error when trying to read tables created with datatypes using python pyspark.sql.types lib

Currently I am using a Glue job written in Python to change the data types of selected columns. After processing, Tableau is unable to open my file and I receive the following error message: An error occurred while loading the data.
[Simba]AthenaJDBC An error has been thrown from the AWS Athena client. HIVE_CANNOT_OPEN_SPLIT: Error opening Hive split
Athena, however, is able to open and read my files without any issues.
import logging
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
#from urlparse import urlparse
from datetime import date, timedelta
import boto3
from pyspark.sql.types import IntegerType
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import LongType
from pyspark.sql.types import DecimalType
from pyspark.sql.functions import to_date,date_format,to_timestamp
def change_datatypes_swg(initialDF, parquet_datatype):
    # assume no transforms
    integer_columns = []
    bigint_columns = []
    decimal_columns = []
    timestamp_columns = []
    print('test')
    # create transformation map
    transform_map = {
        "table_name": {
            "integer_columns": ["col1"],
            "bigint_columns": ["col2", "col3"],
            "decimal_columns": ["col4"],
            "timestamp_columns": [("timestamp", "yyyy-MM-dd HH:mm:ss")],
        }
    }
    print('test2')
    # populate transforms if needed
    if log_type in transform_map:
        parquet_path = parquet_path.replace("/parquet", "/parquet_datatype/")
        integer_columns = transform_map[log_type]["integer_columns"]
        bigint_columns = transform_map[log_type]["bigint_columns"]
        decimal_columns = transform_map[log_type]["decimal_columns"]
        timestamp_columns = transform_map[log_type]["timestamp_columns"]
    formattedDF = initialDF
    # convert to integers
    for integer_column in integer_columns:
        formattedDF = formattedDF.withColumn(integer_column, col(integer_column).cast(IntegerType()))
        print('integer_column', formattedDF)
    for bigint_column in bigint_columns:
        formattedDF = formattedDF.withColumn(bigint_column, col(bigint_column).cast(LongType()))
        print('bigint_column', formattedDF)
    for decimal_column in decimal_columns:
        formattedDF = formattedDF.withColumn(decimal_column, col(decimal_column).cast(DecimalType()))
        print('decimal_column', formattedDF)
    # convert to timestamp
    for timestamp_column in timestamp_columns:
        column_name, column_format = timestamp_column
        formattedDF = formattedDF.withColumn(column_name, to_timestamp(col(column_name), column_format))
        print('timestamp_column', formattedDF)
    print('FormattedDFS', formattedDF)
    return (formattedDF, parquet_datatype)
Has anyone come across an issue like this? I don't understand how the files can be read using Athena and Redshift but not from Tableau through Athena. Are there any alternatives to using pyspark.sql.types, or another library that would work?
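One detail that is easy to overlook with Parquet decimals (an assumption here, not a confirmed diagnosis of the HIVE_CANNOT_OPEN_SPLIT error): DecimalType() with no arguments defaults to precision 10 and scale 0, so if the Athena table declares a different decimal(p, s) the written files and the table schema disagree. Casting with the precision and scale the table actually declares keeps them aligned; a minimal sketch against the decimal loop above (18, 2 is a placeholder):
from pyspark.sql.types import DecimalType
from pyspark.sql.functions import col

# cast with the precision/scale declared in the Athena DDL
# (18, 2 is a placeholder; substitute the real values)
for decimal_column in decimal_columns:
    formattedDF = formattedDF.withColumn(
        decimal_column, col(decimal_column).cast(DecimalType(18, 2)))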

Python filtering for cloudformation stacks with environment names

Updated question:
I need to get cloudformation stacks according to the environment. Below is the code I am using for this:
#!/usr/bin/env python
import boto3
import datetime
from datetime import date
import subprocess
import re, itertools
from collections import defaultdict
regions = ['us-west-2']
env_names = ["dev", "test", "stage"]
stack_names_found = defaultdict(list)
for region in regions:
    session = boto3.session.Session(region_name=region)
    cf_client = session.resource('cloudformation')
    for i in cf_client.stacks.all():
        StackStatus = i.stack_status
        Createdtime = i.creation_time
        StackName1 = i.stack_name
        for env_name in env_names:
            if ('-' + env_name + '-') in StackName1:
                stack_names_found[env_name].append(StackName1)

output = {'StackName': stack_names_found,
          'Createdtime': Createdtime,
          'Status': StackStatus}
print(output)
StackName in output looks like this:
{'StackName': defaultdict(<class 'list'>, {'test': ['customer1-test-server1', 'customer2-test-server1', 'customer3-test-server1', 'customer3-test-server1', 'customer1-test-server2']})}
Instead of:
['customer1-test-server1']
['customer2-test-server1']
['customer3-test-server1']
['customer3-test-server1']
You can have a look at the following version, which uses defaultdict and creates a dictionary of stack names for each env_name:
#!/usr/bin/env python
import boto3
import csv
import datetime
from datetime import date
import subprocess
import re, itertools
from collections import defaultdict
regions = ['us-west-2']
env_names = ["dev", "test", "stage"]
stack_names_found = defaultdict(list)
for region in regions:
    session = boto3.session.Session(region_name=region)
    cf_client = session.resource('cloudformation')
    for i in cf_client.stacks.all():
        StackStatus = i.stack_status
        Createdtime = i.creation_time
        StackName1 = i.stack_name
        for env_name in env_names:
            if ('-' + env_name + '-') in StackName1:
                stack_names_found[env_name].append(StackName1)

print(stack_names_found)
Please note that I haven't run the code, so some adjustments may be needed to make it fully work.
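If the goal is the one-name-per-line output shown in the question rather than one large dictionary, the collected defaultdict can simply be iterated afterwards; a small sketch reusing stack_names_found:
# print each matching stack name on its own line, grouped by environment
for env_name, names in stack_names_found.items():
    for name in names:
        print([name])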

How to output a graph from a CSV using matplotlib and display it in the browser using Django?

I wanted to create a Django application which shows a graph using the data in a CSV file. I wrote the code in the views.py file of the application, and I placed the CSV file alongside views.py. The code is as follows:
#from django.shortcuts import render
import sys
from sys import *
import numpy as np
import matplotlib.pyplot as xy
from datetime import datetime
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
from matplotlib.ticker import Formatter
import PIL
import PIL.Image
import StringIO
from django.http import HttpResponse
from django.template import RequestContext, loader
import csv
def graph():
    a = []
    b = []
    c = []
    h = []
    a1 = []
    b1 = []
    c1 = []
    j = open("Tier 1 Lake.csv")
    for row in csv.reader(j):
        a.append(row[8])
        b.append(row[28])
        c.append(row[2])
    for i, v in enumerate(a):
        if b[i]:
            a1.append(a[i])
            b1.append(b[i])
            c1.append(c[i])
    d = [item for item in range(len(c1)) if c1[item] == 'JOR-01-L']
    e = [a1[i] for i in d]
    f = [b1[i] for i in d]
    FORMAT = '%m/%d/%Y'
    s = sorted(zip(e, f), key=lambda x: datetime.strptime(x[0], '%m/%d/%Y'))
    r = [x[1] for x in s]
    t = [x[0] for x in s]
    g = len(e)
    for k in range(0, g):
        h.append(k)
    fig, ax = xy.subplots()
    fig.autofmt_xdate()
    xy.plot(h, r)
    xy.xticks(h, t)
    xy.plot(r, '-o', ms=10, lw=1, alpha=1, mfc='orange')
    xy.xlabel('Sample Dates')
    xy.ylabel('Air Temperature')
    xy.title('Tier 1 Lake Graph (JOR-01-L)')
    xy.grid(True)
    xy.show()
    buffer = StringIO.StringIO()
    canvas = pylab.get_current_fig_manager().canvas
    canvas.draw()
    graphIMG = PIL.Image.fromstring("RGB", canvas.get_width_height(), canvas.tostring_rgb())
    graphIMG.save(buffer, "PNG")
    xy.close()
    return HttpResponse(buffer.getvalue(), content_type="image/png")
Links and URLs are functioning absolutely fine.
The error I am getting is "No such file or directory: 'Tier 1 Lake.csv'" (IOError).
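That IOError usually means the relative file name is resolved against the directory the Django server was started from, not against the app folder where views.py and the CSV live. A minimal sketch of building the path relative to views.py instead (only the CSV name is taken from the question; everything else is illustrative):
import os
import csv

# absolute path to the CSV sitting next to this views.py
CSV_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "Tier 1 Lake.csv")

with open(CSV_PATH) as j:
    for row in csv.reader(j):
        print(row)  # replace with the same row handling as in graph()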
