Import JSON to NEO4J using py2neo - python

I am trying to import a JSON file, obtained through an API request to Stack Overflow, into Neo4j. I have been following this tutorial. However, I get errors like the following while trying to execute the query:
File "/Users/ahmedov/anaconda/lib/python2.7/site-packages/py2neo/cypher/core.py", line 306, in commit
return self.post(self.__commit or self.__begin_commit)
File "/Users/ahmedov/anaconda/lib/python2.7/site-packages/py2neo/cypher/core.py", line 261, in post
raise self.error_class.hydrate(error)
File "/Users/ahmedov/anaconda/lib/python2.7/site-packages/py2neo/cypher/error/core.py", line 54, in hydrate
error_cls = getattr(error_module, title)
AttributeError: 'module' object has no attribute 'SyntaxError'
I am using the following code:
import os
import requests
from py2neo import neo4j
from py2neo import Graph
from py2neo import Path, authenticate
# set up authentication parameters
authenticate("localhost:7474", "neo4j", "neo4j")
# connect to authenticated graph database
#graph = Graph("http://localhost:7474/db/data/")
# Connect to graph and add constraints.
neo4jUrl = os.environ.get('NEO4J_URL',"http://localhost:7474/db/data/")
graph = neo4j.Graph(neo4jUrl)
# Connect to graph and add constraints.
#neo4jUrl = os.environ.get('NEO4J_URL',"http://localhost:7474/db/data/")
#graph = neo4j.GraphDatabaseService(neo4jUrl)
# Add uniqueness constraints.
graph.cypher.execute("CREATE CONSTRAINT ON (q:Question) ASSERT q.id IS UNIQUE;")
# Build URL.
apiUrl ="https://api.stackexchange.com/2.2/questions?pagesize=100&order=desc&sort=creation&tagged=neo4j&site=stackoverflow&filter=!5-i6Zw8Y)4W7vpy91PMYsKM-k9yzEsSC1_Uxlf"
# Send GET request.
json = requests.get(apiUrl, headers = {"accept":"application/json"}).json()
# Build query.
query = """
UNWIND data.items as q
MERGE (question:Question {id:q.question_id}) ON CREATE
SET question.title = q.title, question.share_link = q.share_link, question.favorite_count = q.favorite_count
MERGE (owner:User {id:q.owner.user_id}) ON CREATE SET owner.display_name = q.owner.display_name
MERGE (owner)-[:ASKED]->(question)
FOREACH (tagName IN q.tags | MERGE (tag:Tag {name:tagName}) MERGE (question)-[:TAGGED]->(tag))
FOREACH (a IN q.answers |
MERGE (question)<-[:ANSWERS]-(answer:Answer {id:a.answer_id})
MERGE (answerer:User {id:a.owner.user_id}) ON CREATE SET answerer.display_name = a.owner.display_name
MERGE (answer)<-[:PROVIDED]-(answerer)
)
"""
statement = "MERGE (n:Person {name:{N}}) RETURN n"
results = graph.cypher.run(query,json=json)
tx = graph.cypher.begin()
def add_names(*names):
    for name in names:
        tx.append(statement, {"N": name})
    tx.process()
add_names("Homer", "Marge", "Bart", "Lisa", "Maggie")
add_names("Peter", "Lois", "Chris", "Meg", "Stewie")
tx.append(query,)
tx.commit()
# Send Cypher query.
The problem originates from the following line:
results = graph.cypher.run(query,json=json)
I had to change the above line to adjust it to the newer py2neo API. The original line looked like this:
neo4j.CypherQuery(graph, query).run(json=json)
So basically, I need to find a way to tell Neo4j that I need to process the JSON file using the given query. I tried reading the documentation and searching the web with no luck. Any help would be appreciated.

A couple of things to make your script work:
from py2neo import neo4j is not a valid import anymore.
In your query, you pass a JSON map as a parameter, but you don't use the parameter syntax in the query; I added WITH {json} as data at the beginning of the query.
Added secure=False for the connection.
The last tx.append(query,) is not needed.
Working script:
import os
import requests
#from py2neo import neo4j
from py2neo import Graph
from py2neo import Path, authenticate
# set up authentication parameters
authenticate("localhost:7474", "neo4j", "neo4j")
# connect to authenticated graph database
#graph = Graph("http://localhost:7474/db/data/")
# Connect to graph and add constraints.
neo4jUrl = os.environ.get('NEO4J_URL',"http://localhost:7474/db/data/")
graph = Graph(neo4jUrl,secure=False)
# Connect to graph and add constraints.
#neo4jUrl = os.environ.get('NEO4J_URL',"http://localhost:7474/db/data/")
#graph = neo4j.GraphDatabaseService(neo4jUrl)
# Add uniqueness constraints.
graph.run("CREATE CONSTRAINT ON (q:Question) ASSERT q.id IS UNIQUE;")
# Build URL.
apiUrl ="https://api.stackexchange.com/2.2/questions?pagesize=100&order=desc&sort=creation&tagged=neo4j&site=stackoverflow&filter=!5-i6Zw8Y)4W7vpy91PMYsKM-k9yzEsSC1_Uxlf"
# Send GET request.
json = requests.get(apiUrl, headers = {"accept":"application/json"}).json()
#print(json);
# Build query.
query = """
WITH {json} as data
UNWIND data.items as q
MERGE (question:Question {id:q.question_id}) ON CREATE
SET question.title = q.title, question.share_link = q.share_link, question.favorite_count = q.favorite_count
MERGE (owner:User {id:q.owner.user_id}) ON CREATE SET owner.display_name = q.owner.display_name
MERGE (owner)-[:ASKED]->(question)
FOREACH (tagName IN q.tags | MERGE (tag:Tag {name:tagName}) MERGE (question)-[:TAGGED]->(tag))
FOREACH (a IN q.answers |
MERGE (question)<-[:ANSWERS]-(answer:Answer {id:a.answer_id})
MERGE (answerer:User {id:a.owner.user_id}) ON CREATE SET answerer.display_name = a.owner.display_name
MERGE (answer)<-[:PROVIDED]-(answerer)
)
"""
statement = "MERGE (n:Person {name:{N}}) RETURN n"
results = graph.run(query,json=json)
tx = graph.begin()
def add_names(*names):
    for name in names:
        tx.append(statement, {"N": name})
    tx.process()
add_names("Homer", "Marge", "Bart", "Lisa", "Maggie")
add_names("Peter", "Lois", "Chris", "Meg", "Stewie")
#tx.append(query,)
tx.commit()
Result: (screenshot of the imported graph omitted)

Using a recent version of py2neo, the syntax changes a little: rather than {json}, it is necessary to use $json, as explained below:
from py2neo import Graph
graph = Graph(NEO4J_URI, user=NEO4J_USER, password=NEO4J_PASSWORD)
json = ['22644198319', '15383242341']
result = graph.run(
    """WITH $json as data
    UNWIND data as id
    MATCH (c:Person {id: id}) RETURN c.name""",
    json=json
)
data_frame_result = result.to_data_frame()
print(data_frame_result)
With these changes to @christophe-willemsen's example, the code runs on the newest versions.
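For reference, applying the $-parameter syntax back to the Stack Overflow import from the first answer would look something like the sketch below (the bolt URL and credentials are assumptions; json is the dict returned by the API call in the question):

from py2neo import Graph

graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))  # assumed local instance
import_query = """
WITH $json AS data
UNWIND data.items AS q
MERGE (question:Question {id: q.question_id})
  ON CREATE SET question.title = q.title, question.share_link = q.share_link
RETURN count(question)
"""
# json is the dict returned by requests.get(apiUrl).json(), as in the question
print(graph.run(import_query, json=json).evaluate())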

AWS Glue create_partition using boto3 successful, but Athena not showing results for query

I have a Glue script to create new partitions using create_partition(). The Glue script runs successfully, and I can see the partitions in the Athena console when using SHOW PARTITIONS. For the Glue create_partition script, I referred to this sample code: https://medium.com/@bv_subhash/demystifying-the-ways-of-creating-partitions-in-glue-catalog-on-partitioned-s3-data-for-faster-e25671e65574
When I try to run an Athena query for a given partition that was newly added, I get no results.
Do I need to trigger the MSCK command even though I add the partitions using create_partition()? Appreciate any suggestions, please.
I found the solution myself and wanted to share it with the SO community, so it may be useful to someone. The following code, when run as a Glue job, creates the partitions, and the new partition columns can also be queried in Athena. Please change/add the parameter values (database name, table name, partition columns) as needed.
import boto3
import urllib.parse
import os
import copy
import sys
# Configure database / table name and emp_id, file_id from workflow params?
DATABASE_NAME = 'my_db'
TABLE_NAME = 'enter_table_name'
emp_id_tmp = ''
file_id_tmp = ''
# Initialise the Glue client using boto3
glue_client = boto3.client('glue')
#get current table schema for the given database name & table name
def get_current_schema(database_name, table_name):
    try:
        # use the parameters passed in rather than the module-level constants
        response = glue_client.get_table(
            DatabaseName=database_name,
            Name=table_name
        )
    except Exception as error:
        print("Exception while fetching table info")
        sys.exit(-1)
    # Parsing table info required to create partitions from table
    table_data = {}
    table_data['input_format'] = response['Table']['StorageDescriptor']['InputFormat']
    table_data['output_format'] = response['Table']['StorageDescriptor']['OutputFormat']
    table_data['table_location'] = response['Table']['StorageDescriptor']['Location']
    table_data['serde_info'] = response['Table']['StorageDescriptor']['SerdeInfo']
    table_data['partition_keys'] = response['Table']['PartitionKeys']
    return table_data
#prepare partition input list using table_data
def generate_partition_input_list(table_data):
    input_list = []  # Initializing empty list
    part_location = "{}/emp_id={}/file_id={}/".format(table_data['table_location'], emp_id_tmp, file_id_tmp)
    input_dict = {
        'Values': [
            emp_id_tmp, file_id_tmp
        ],
        'StorageDescriptor': {
            'Location': part_location,
            'InputFormat': table_data['input_format'],
            'OutputFormat': table_data['output_format'],
            'SerdeInfo': table_data['serde_info']
        }
    }
    input_list.append(input_dict.copy())
    return input_list
#create partition dynamically using the partition input list
table_data = get_current_schema(DATABASE_NAME, TABLE_NAME)
input_list = generate_partition_input_list(table_data)
try:
    create_partition_response = glue_client.batch_create_partition(
        DatabaseName=DATABASE_NAME,
        TableName=TABLE_NAME,
        PartitionInputList=input_list
    )
    print('Glue partition created successfully.')
    print(create_partition_response)
except Exception as e:
    # Handle exception as per your business requirements
    print(e)
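As a side note on the MSCK question: partitions added through the Glue API are visible to Athena without running MSCK REPAIR TABLE, since Athena reads the Glue Data Catalog directly. If you want an optional sanity check from the same job, something along these lines should work (a sketch reusing the variables above):

# Optional: confirm the new partition is registered in the Glue catalog
verify_response = glue_client.get_partition(
    DatabaseName=DATABASE_NAME,
    TableName=TABLE_NAME,
    PartitionValues=[emp_id_tmp, file_id_tmp]
)
print(verify_response['Partition']['StorageDescriptor']['Location'])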

Get all Microsoft.VSTS... fields from Azure DevOps with Python

Hi, I have succeeded in connecting Azure DevOps with Python and retrieving data from system fields with a WIQL query, but I cannot execute the program and I get an error like this:
KeyError [Microsoft.VSTS.Common.AcceptanceCriteria]. I do not understand where the problem is, because I get the value for Priority and not for AcceptanceCriteria. What am I missing? Any help would be appreciated.
Every time I execute this query I get this error:
from vsts.vss_connection import VssConnection
from msrest.authentication import BasicAuthentication
import json
from vsts.work_item_tracking.v4_1.models.wiql import Wiql
def emit(msg, *args):
    print(msg % args)

def print_work_item(work_item):
    emit(
        "{0}, {1}, {2}, {3}, {4}".format(
            work_item.fields["System.WorkItemType"],
            work_item.id,
            work_item.fields["System.Title"],
            work_item.fields["Microsoft.VSTS.Common.AcceptanceCriteria"],
            work_item.fields["Microsoft.VSTS.Common.Priority"]
        )
    )
personal_access_token = 'xbxbxxbxhhdhdjdkdkddkdkdkdkdkdkkdkdkd'
organization_url = 'https://dev.azure.com/YourorgName'
# Create a connection to the org
credentials = BasicAuthentication('', personal_access_token)
connection = VssConnection(base_url=organization_url, creds=credentials)
wiql = Wiql(
query="""select [System.Id],[Microsoft.VSTS.Common.AcceptanceCriteria],[System.Title],[System.WorkItemType],[Microsoft.VSTS.Common.Priority] From WorkItems """
)
wit_client = connection.get_client('vsts.work_item_tracking.v4_1.work_item_tracking_client.WorkItemTrackingClient')
wiql_results = wit_client.query_by_wiql(wiql).work_items
if wiql_results:
    # WIQL query gives a WorkItemReference with ID only
    # => we get the corresponding WorkItem from id
    work_items = (
        wit_client.get_work_item(int(res.id)) for res in wiql_results
    )
As a simple answer, you can update your print function to
def print_work_item(work_item):
    emit(
        "{0}, {1}, {2}".format(
            work_item.fields["System.WorkItemType"],
            work_item.id,
            work_item.fields["System.Title"]
        )
    )
Some fields (like Microsoft.VSTS.Common.AcceptanceCriteria and Microsoft.VSTS.Common.Priority) may not be present on some work item types. Therefore, before printing a field you have to check its availability in the fields dictionary, for example as sketched below.
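A sketch of such a defensive read, using dict.get() with placeholder defaults (the fallback strings are just illustrative; emit is the helper from the question):

def print_work_item(work_item):
    # Fields not set on a given work item type are simply absent from the dict,
    # so read them with .get() instead of indexing directly.
    acceptance = work_item.fields.get("Microsoft.VSTS.Common.AcceptanceCriteria", "<no acceptance criteria>")
    priority = work_item.fields.get("Microsoft.VSTS.Common.Priority", "<no priority>")
    emit(
        "{0}, {1}, {2}, {3}, {4}".format(
            work_item.fields["System.WorkItemType"],
            work_item.id,
            work_item.fields["System.Title"],
            acceptance,
            priority
        )
    )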

Elasticsearch DSL update by scan?

Is it possible to update with the scan() method from Python's elasticsearch-dsl library?
For example:
search = Search(index=INDEX).query(query).params(request_timeout=6000)
print('Scanning Query and updating.')
for hit in search.scan():
    _id = hit.meta.id
    # sql query to get the record from the database
    record = get_record_by_id(_id)
    hit.column1 = record['column1']
    hit.column2 = record['column2']
    # not sure what to use to update here
This can be done by using the elasticsearch client:
for hit in search.scan():
    _id = hit.meta.id
    print(_id)
    # get data from database
    record = get_record_by_id(_id)
    body = dict(record)
    body.pop('_id')
    # update the document via the elasticsearch client
    elastic_client.update(
        index=INDEX,
        id=_id,
        body={"doc": body}
    )
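If there are many hits, one update request per document can be slow. A possible alternative is the bulk helper from the low-level elasticsearch client; a sketch, assuming the same INDEX, elastic_client, search and get_record_by_id() as above:

from elasticsearch.helpers import bulk

def update_actions(search):
    # One partial-update action per scanned hit
    for hit in search.scan():
        record = dict(get_record_by_id(hit.meta.id))
        record.pop('_id', None)
        yield {
            "_op_type": "update",
            "_index": INDEX,
            "_id": hit.meta.id,
            "doc": record,
        }

bulk(elastic_client, update_actions(search))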

Cosmos DB - Delete Document with Python

In this SO question I learned that I cannot delete a Cosmos DB document using SQL.
Using Python, I believe I need the DeleteDocument() method. This is how I'm getting the document IDs that are required (I believe) to then call the DeleteDocument() method.
# set up the client
client = document_client.DocumentClient()
# use a SQL based query to get a bunch of documents
query = { 'query': 'SELECT * FROM server s' }
result_iterable = client.QueryDocuments('dbs/DB/colls/coll', query, options)
results = list(result_iterable);
for x in range(0, len(results)):
    docID = results[x]['id']
Now, at this stage I want to call DeleteDocument().
Its inputs are document_link and options.
I can define document_link as something like
document_link = 'dbs/DB/colls/coll/docs/'+docID
And successfully call ReadAttachments() for example, which has the same inputs as DeleteDocument().
When I do, however, I get an error...
The partition key supplied in x-ms-partitionkey header has fewer components than defined in the collection
...and now I'm totally lost
UPDATE
Following on from Jay's help, I believe I'm missing the partitionKey element in the options.
In this example, I've created a testing database; it looks like this (screenshot omitted):
So I think my partition key is /testPART
When I include the partitionKey in the options, however, no results are returned (and so print len(results) outputs 0).
Removing partitionKey means that results are returned, but the delete attempt fails as before.
# Query them in SQL
query = { 'query': 'SELECT * FROM c' }
options = {}
options['enableCrossPartitionQuery'] = True
options['maxItemCount'] = 2
options['partitionKey'] = '/testPART'
result_iterable = client.QueryDocuments('dbs/testDB/colls/testCOLL', query, options)
results = list(result_iterable)
# should be > 0
print len(results)
for x in range(0, len(results)):
    docID = results[x]['id']
    print docID
    client.DeleteDocument('dbs/testDB/colls/testCOLL/docs/'+docID, options=options)
    print 'deleted', docID
According to your description, I tried to use the pydocumentdb module to delete a document in my Azure DocumentDB and it works for me.
Here is my code:
import pydocumentdb;
import pydocumentdb.document_client as document_client
config = {
    'ENDPOINT': 'Your url',
    'MASTERKEY': 'Your master key',
    'DOCUMENTDB_DATABASE': 'familydb',
    'DOCUMENTDB_COLLECTION': 'familycoll'
};
# Initialize the Python DocumentDB client
client = document_client.DocumentClient(config['ENDPOINT'], {'masterKey': config['MASTERKEY']})
# use a SQL based query to get a bunch of documents
query = { 'query': 'SELECT * FROM server s' }
options = {}
options['enableCrossPartitionQuery'] = True
options['maxItemCount'] = 2
result_iterable = client.QueryDocuments('dbs/familydb/colls/familycoll', query, options)
results = list(result_iterable);
print(results)
client.DeleteDocument('dbs/familydb/colls/familycoll/docs/id1',options)
print 'delete success'
Console Result:
[{u'_self': u'dbs/hitPAA==/colls/hitPAL3OLgA=/docs/hitPAL3OLgABAAAAAAAAAA==/', u'myJsonArray': [{u'subId': u'sub1', u'val': u'value1'}, {u'subId': u'sub2', u'val': u'value2'}], u'_ts': 1507687788, u'_rid': u'hitPAL3OLgABAAAAAAAAAA==', u'_attachments': u'attachments/', u'_etag': u'"00002100-0000-0000-0000-59dd7d6c0000"', u'id': u'id1'}, {u'_self': u'dbs/hitPAA==/colls/hitPAL3OLgA=/docs/hitPAL3OLgACAAAAAAAAAA==/', u'myJsonArray': [{u'subId': u'sub3', u'val': u'value3'}, {u'subId': u'sub4', u'val': u'value4'}], u'_ts': 1507687809, u'_rid': u'hitPAL3OLgACAAAAAAAAAA==', u'_attachments': u'attachments/', u'_etag': u'"00002200-0000-0000-0000-59dd7d810000"', u'id': u'id2'}]
delete success
Please notice that you need to set the enableCrossPartitionQuery property to True in options if your documents are cross-partitioned.
Must be set to true for any query that requires to be executed across
more than one partition. This is an explicit flag to enable you to
make conscious performance tradeoffs during development time.
You can find the above description here.
Update Answer:
I think you misunderstand the meaning of the partitionKey property in the options.
For example, my container is created like this (screenshot omitted), and my documents are as below:
{
    "id": "1",
    "name": "jay"
}
{
    "id": "2",
    "name": "jay2"
}
My partition key is 'name', so here I have two partitions: 'jay' and 'jay2'.
So here you should set the partitionKey property to 'jay' or 'jay2', not 'name'.
Please modify your code as below:
options = {}
options['enableCrossPartitionQuery'] = True
options['maxItemCount'] = 2
options['partitionKey'] = 'jay'  # please change this in your code
result_iterable = client.QueryDocuments('dbs/db/colls/testcoll', query, options)
results = list(result_iterable);
print(results)
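Not part of the original answer, but for completeness: with partitionKey set to the value ('jay'), the delete loop from the question should then go through. A sketch reusing the client, results and options above (Python 2 print style as in the question):

for doc in results:
    # options already carries partitionKey = 'jay', the value for these documents
    client.DeleteDocument('dbs/db/colls/testcoll/docs/' + doc['id'], options)
    print 'deleted', doc['id']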
Hope it helps you.
Using the azure.cosmos library:
Install and import the azure-cosmos package:
from azure.cosmos import exceptions, CosmosClient, PartitionKey
Define a delete-items function, in this case using the partition key in the query:
def deleteItems(deviceid):
    client = CosmosClient(config.cosmos.endpoint, config.cosmos.primarykey)
    # Create the database if it does not exist
    database = client.create_database_if_not_exists(id="azure-cosmos-db-name")
    # Create the container
    # Using a good partition key improves the performance of database operations.
    container = database.create_container_if_not_exists(id="container-name", partition_key=PartitionKey(path='/your-partition-path'), offer_throughput=400)
    # fetch items
    query = f"SELECT * FROM c WHERE c.device.deviceid IN ('{deviceid}')"
    items = list(container.query_items(query=query, enable_cross_partition_query=False))
    for item in items:
        container.delete_item(item, 'partition-key')
Usage:
deviceid = 10
deleteItems(deviceid)
Full example on GitHub: https://github.com/eladtpro/python-iothub-cosmos

pass in table name to sqlalchemy query through url in flask

In short, how do I do this:
var="TableName"
models.var.query.all()
Explanation:
My goal is to allow the user to change the order of a list of items.
I set up an AJAX call that sends an array of IDs to the API below.
It works if I hard-code the query and make one API view per table.
My problem is that I want "table" to fill in this line:
models.table.query.filter_by(id=item).first()
to complete the query.
Here is the API view, which gives me the error "no attribute 'table'":
@app.route('/order/<table>')
def order(table):
    # import pdb;pdb.set_trace()
    sortedItems = request.args.listvalues()[0]
    o = 1
    import pdb; pdb.set_trace()
    for item in sortedItems:
        grab = models.table.query.filter_by(id=item).first()
        grab.order = o
        o = o + 1
    db.session.commit()
    return jsonify(result=sortedItems)
You can use getattr():
>>> var = 'table_name'
>>> table = getattr(models, var)
>>> table.query.filter_by(id=item).first()
getattr() will raise an AttributeError if the attribute you're trying to get does not exist (a guarded variant is sketched after the example below).
Example for your order()-function:
@app.route('/order/<table>')
def order(table):
    sortedItems = request.args.listvalues()[0]
    o = 1
    table = getattr(models, table)
    for item in sortedItems:
        grab = table.query.filter_by(id=item).first()
        grab.order = o
        o = o + 1
    db.session.commit()
    return jsonify(result=sortedItems)
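If the URL may contain a table name that does not exist on models, a guarded variant is sketched below (abort is Flask's standard helper; otherwise the same code as above):

from flask import abort

@app.route('/order/<table>')
def order(table):
    model = getattr(models, table, None)
    if model is None:
        # Unknown table name in the URL: return 404 instead of raising AttributeError
        abort(404)
    sortedItems = request.args.listvalues()[0]
    o = 1
    for item in sortedItems:
        grab = model.query.filter_by(id=item).first()
        grab.order = o
        o = o + 1
    db.session.commit()
    return jsonify(result=sortedItems)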
