How to make a parent-child relationship using the Python Elasticsearch client?

I am using Python's Elasticsearch client to make searchable PDFs. One group of PDFs is called surveys. I would like to create a parent-child relationship where the parent is the group of PDFs and the children are the filenames within the group. However, I keep getting errors. My code is below.
In settings.py:
import elasticsearch
from elasticsearch import Elasticsearch, RequestsHttpConnection

ES_CLIENT = Elasticsearch(
    ['http://127.0.0.1:9200/'],  # could be 9201, 9300, 9301
    connection_class=RequestsHttpConnection
)
In my command.py:

from elasticsearch import Elasticsearch
from django.conf import settings

self.indices_client = settings.ES_CLIENT

print "create parent"
self.indices_client.index(
    # op_type='create',
    id='surveys',
    doc_type='parent',
    body={"properties": {'title': {'type': 'string', 'index': 'not_analyzed'}}},
    index="surveys"
)

# create child index file_name with parent index surveys
# self.indices_client.create(index=child_index)
print 'create child'
self.indices_client.index(
    doc_type='child',
    body=upload_models.Survey._meta.es_mapping,
    index=child_index,
    parent='surveys'
)
print 'post child'
I keep getting this error:
raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
elasticsearch.exceptions.RequestError: TransportError(400, u'illegal_argument_exception', u"Can't specify parent if no parent field has been configured")

The error occurs during the child mapping in:
self.indices_client.index(
    doc_type='child',
    body=upload_models.Survey._meta.es_mapping,
    index=child_index,
    parent='surveys'
)
The parent parameter here is the ID of a parent document, not a mapping declaration, so it can't be used for your purpose. Instead, try:
self.indices_client.index(
    doc_type='child',
    body={
        'child': {
            '_parent': {'type': 'surveys'},
            'properties': upload_models.Survey._meta.es_mapping
        }
    },
    index=child_index
)
Or try another function, put_mapping(*args, **kwargs):

self.indices_client.indices.put_mapping(
    doc_type='child',
    index=child_index,
    body={
        'child': {
            '_parent': {'type': 'surveys'},
            'properties': upload_models.Survey._meta.es_mapping
        }
    }
)
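For reference, here is a minimal end-to-end sketch of the parent/child flow. This assumes Elasticsearch 2.x (where _parent is configured per type at index creation); the index, type, and field names are illustrative, not taken from the question:

from elasticsearch import Elasticsearch

es = Elasticsearch(['http://127.0.0.1:9200/'])

# Declare both mappings up front; the child type names its parent type,
# which is what makes the parent= routing parameter legal later.
es.indices.create(
    index='surveys',
    body={
        'mappings': {
            'survey': {
                'properties': {'title': {'type': 'string', 'index': 'not_analyzed'}}
            },
            'file': {
                '_parent': {'type': 'survey'},  # parent TYPE, not a document ID
                'properties': {'file_name': {'type': 'string', 'index': 'not_analyzed'}}
            }
        }
    }
)

# Index one parent document, then a child routed to it via parent=<parent doc ID>.
es.index(index='surveys', doc_type='survey', id=1, body={'title': 'Customer survey'})
es.index(index='surveys', doc_type='file', parent=1, body={'file_name': 'q1.pdf'})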

Related

How to migrate a table containing a JSON column from AWS Redshift to BigQuery in Python?

I want to migrate data tables from an AWS database to BigQuery. I have a table named sampletable that includes the columns id, user_id, and log. log is a JSON field that contains a dictionary of keys and their respective values, for example:
'reason': {
    'id': 5,
    'name': 'Sample name',
    'contact': {
        'number': 123,
        'address': None
    }
},
'subreason': {
    'id': 80,
    'name': 'Sample name',
    'is_active': True,
    'created_at': '2022-07-18T18:33:28.911Z',
    'deleted_at': None,
    'complaint_id': 5,
},
This is the function that loads the data from the table to BigQuery:
def load_data(table_id, data):
    print("load_data::Writing records to table", table_id)
    job_config = bigquery.LoadJobConfig(
        write_disposition="WRITE_APPEND",
        schema=[
            bigquery.SchemaField("id", "INT64"),
            bigquery.SchemaField("user_id", "INT64"),
            bigquery.SchemaField("log", "JSON"),
        ],
    )
    try:
        start = time.time()
        job = client.load_table_from_dataframe(
            data, table_id, job_config=job_config
        )
        job.result()
        end = time.time()
        print("load_data::Time taken for writing " + str(data.shape[0]) + " records: ", end - start, "s")
    except Exception as e:
        print("load_data::exception", e)
        print("load_data::Could not establish connection with Google BigQuery. Terminating program")
        conn.close()
        sys.exit()
However, an exception arises: "cannot mix list and non-list, non-null values".
I tried changing the schema in this way:
schema=[
    bigquery.SchemaField("id", "INT64"),
    bigquery.SchemaField("user_id", "INT64"),
    bigquery.SchemaField("log", "RECORD", fields=[
        bigquery.SchemaField("reason", "RECORD", fields=[
            bigquery.SchemaField("id", "INT64"),
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("contact", "RECORD", fields=[
                bigquery.SchemaField("number", "STRING"),
                bigquery.SchemaField("address", "STRING"),
            ]),
        ]),
        bigquery.SchemaField("subreason", "RECORD", fields=[
            bigquery.SchemaField("id", "INT64"),
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("is_active", "BOOLEAN"),
            bigquery.SchemaField("created_at", "TIMESTAMP"),
            bigquery.SchemaField("deleted_at", "TIMESTAMP"),
            bigquery.SchemaField("complaint_id", "INT64"),
        ]),
    ]),
]
However, I then get the exception "with type dict: was expecting tuple of (key, value) pair".
Can anyone guide me on this issue? I am new to migrating JSON columns between tables. What is the proper way to modify the schema so it accepts the JSON columns for migration?
You can consider the approach below.
In this approach, you load the data as the JSON data type in BigQuery. However, the JSON file needs some manual adjustment, since BigQuery accepts newline-delimited JSON for data ingestion. See the sample updated JSON file below.
{"log":{"reason":{"contact":{"address": null,"number": 123},"id": 5,"name": "Sample name"},"subreason": {"complaint_id": 5,"created_at": "2022-07-18T18:33:28.911Z","deleted_at": "None","id": 80,"is_active": true,"name": "Sample name"}}}
Notice that I compressed the record under one key named "log" and also compressed it onto one line to satisfy the newline-delimited JSON format.
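As a rough sketch of how such a file could be produced programmatically (assuming the log values are already Python dicts; the records list and file name here are illustrative, not from the original post):

import json

records = [
    {"log": {"reason": {"id": 5, "name": "Sample name"},
             "subreason": {"id": 80, "name": "Sample name"}}},
]

with open("your_json_file.json", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")  # one JSON object per line (NDJSON)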
Below is the Python code I used to ingest the data:

table_id = "your-project.your-dataset.your-table"
file_path = "/path/of/your_json_file.json"

def load_table_file(file_path, table_id):
    # [START bigquery_load_from_file]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,
        # write_disposition="WRITE_APPEND",
        schema=[
            bigquery.SchemaField("log", "JSON"),
        ],
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    # [END bigquery_load_from_file]
    return table

load_table_file(file_path, table_id)
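Once loaded, the JSON column can be queried with BigQuery's JSON functions. A hedged example (assuming the table above and a bigquery.Client() instance):

from google.cloud import bigquery

client = bigquery.Client()
query = """
    SELECT JSON_VALUE(log, '$.reason.name') AS reason_name
    FROM `your-project.your-dataset.your-table`
"""
for row in client.query(query).result():
    print(row.reason_name)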

Postgres and SQLAlchemy not updating rows properly

Whenever I execute an UPDATE statement using a SQLAlchemy session and then call commit(), it rarely updates the database.
Here is my environment:
I have two servers running. One is for my database the other is for my python server.
Database Server:
Postgres v9.6 - On Amazon's RDS
Server with Python
Linux 3.13.0-65-generic x86_64 - On an Amazon EC2 Instance
SQLAlchemy v1.1.5
Python v3.4.3
Flask 0.11.1
Also, I use pgAdmin 4 for querying my table.
The files of importance:
server/models/category.py
from sqlalchemy.orm import backref
from .. import db
from flask import jsonify

class Category(db.Model):
    __tablename__ = "categories"
    id = db.Column(db.Integer, primary_key=True)
    cat_name = db.Column(db.String(80))
    includes = db.Column(db.ARRAY(db.String), default=[])
    excludes = db.Column(db.ARRAY(db.String), default=[])
    parent_id = db.Column(db.ForeignKey('categories.id', ondelete='SET NULL'), nullable=True, default=None)
    subcategories = db.relationship('Category', backref=backref(
        'categories',
        remote_side=[id],
        single_parent=True,
        cascade="all, delete-orphan"
    ))
    assigned_user = db.Column(db.String(80), nullable=True, default=None)

    def to_dict(self):
        return dict(
            id=self.id,
            cat_name=self.cat_name,
            parent_id=self.parent_id,
            includes=self.includes,
            excludes=self.excludes,
            assigned_user=self.assigned_user,
        )

    def json(self):
        return jsonify(self.to_dict())

    def __repr__(self):
        return "<%s %r>" % (self.__class__, self.to_dict())
class CategoryOperations:
    ...

    @staticmethod
    def update_category(category):
        return """
            UPDATE categories
            SET cat_name='{0}',
                parent_id={1},
                includes='{2}',
                excludes='{3}',
                assigned_user={4}
            WHERE id={5}
            RETURNING cat_name, parent_id, includes, excludes, assigned_user
        """.format(
            category.cat_name,
            category.parent_id if category.parent_id is not None else 'null',
            "{" + ",".join(category.includes) + "}",
            "{" + ",".join(category.excludes) + "}",
            "'" + category.assigned_user + "'" if category.assigned_user is not None else 'null',
            category.id
        )

    @staticmethod
    def update(category, session):
        print("Updating category with id: " + str(category.id))
        stmt = CategoryOperations.update_category(category)
        print(stmt)
        row_updated = session.execute(stmt).fetchone()
        return Category(
            id=category.id,
            cat_name=row_updated[0],
            parent_id=row_updated[1],
            includes=row_updated[2],
            excludes=row_updated[3],
            assigned_user=row_updated[4]
        )

    ...
server/api/category.py
from flask import jsonify, request
import json
from .api_utils.utils import valid_request as is_valid_request
from . import api
from ..models.category import Category, CategoryOperations
from ..models.users_categories import UsersCategoriesOperations, UsersCategories
from ..models.listener_item import ListenerItemOperations, ListenerItem
from ..models.user import UserOperations
from ..schemas.category import category_schema
from .. import get_session

...

@api.route('/categories/<int:id>', methods=['PUT'])
def update_category(id):
    category_json = request.json
    if category_json is None:
        return "Bad Request: Request not sent as json", 400
    valid_json, json_err = is_valid_request(category_json, ['cat_name', 'parent_id', 'includes', 'excludes', 'assigned_user'], "and")
    if not valid_json:
        return json_err, 400
    category = Category(
        id=id,
        cat_name=category_json['cat_name'],
        parent_id=category_json['parent_id'],
        includes=category_json['includes'],
        excludes=category_json['excludes'],
        assigned_user=category_json['assigned_user'],
    )
    session = get_session()
    try:
        updated_category = CategoryOperations.update(category, session)
        session.commit()
        print(updated_category.to_dict())
        return jsonify(updated_category.to_dict()), 200
    except Exception as e:
        print("ROLLBACK")
        print(e)
        session.rollback()
        return str(e), 500

...
There is one more file that will probably be useful in this case:
server/__init__.py
import sqlalchemy as sa
from flask import Flask
from flask_marshmallow import Marshmallow
from flask_sqlalchemy import SQLAlchemy
from config import config
from sqlalchemy.orm import scoped_session, sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from flask_cors import CORS, cross_origin
from .db_config import CONFIG

db = SQLAlchemy()
ma = Marshmallow()

Engine = sa.create_engine(
    CONFIG.POSTGRES_URL,
    client_encoding='utf8',
    pool_size=20,
    max_overflow=0
)
Session = sessionmaker(bind=Engine)
conn = Engine.connect()

def get_session():
    return Session(bind=conn)

def create_app(config_name):
    app = Flask(__name__, static_url_path="/app", static_folder="static")
    app_config = config[config_name]()
    print(app_config)
    app.config.from_object(app_config)

    from .api import api as api_blueprint
    app.register_blueprint(api_blueprint, url_prefix='/api')

    from .api.routes import routes
    routes(app)

    from .auth import authentication
    authentication(app)

    db.init_app(app)
    ma.init_app(app)
    CORS(app)
    ...
    return app
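One thing to note in passing: get_session here binds every new Session to the single conn opened at import time, and scoped_session is imported but never used. A common alternative (a sketch, not the author's setup) lets each session draw its own connection from the engine's pool:

from sqlalchemy.orm import scoped_session, sessionmaker

Session = scoped_session(sessionmaker(bind=Engine))

def get_session():
    # Thread-local session; connections are checked out of the engine's
    # pool per transaction instead of sharing one long-lived connection.
    return Session()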
To explain a little more, given the environment and files above, let's say I have a row in my categories table like so:
{
    "assigned_user": null,
    "cat_name": "Category Name Before",
    "excludes": [
        "exclude1",
        "excludeBefore"
    ],
    "id": 2,
    "includes": [
        "include1",
        "include2"
    ],
    "parent_id": null
}
When I do a PUT request to /api/categories/2 with the body as:
{
    "assigned_user": null,
    "cat_name": "Category Name 1",
    "excludes": [
        "exclude1",
        "exclude2"
    ],
    "id": 2,
    "includes": [
        "include1",
        "include2"
    ],
    "parent_id": null
}
During the request, I print out the SQL Statement that my PUT request created (for testing) and I get this:
UPDATE categories
SET cat_name='Category Name 1',
parent_id=null,
includes='{include1,include2}',
excludes='{exclude1,exclude2}',
assigned_user=null
WHERE id=2
RETURNING cat_name, parent_id, includes, excludes, assigned_user
After committing the UPDATE statement, it returns the response, and I get the updated object back like so:
{
    "assigned_user": null,
    "cat_name": "Category Name 1",
    "excludes": [
        "exclude1",
        "exclude2"
    ],
    "id": 2,
    "includes": [
        "include1",
        "include2"
    ],
    "parent_id": null
}
When I do a GET request to /api/categories/2, I get the same object as well:
{
    "assigned_user": null,
    "cat_name": "Category Name 1",
    "excludes": [
        "exclude1",
        "exclude2"
    ],
    "id": 2,
    "includes": [
        "include1",
        "include2"
    ],
    "parent_id": null
}
However, when I run the SQL command below in pgAdmin, I get the old version (it didn't update the row in the database):
SELECT * FROM categories WHERE id=2
Here is the object I get:
{
    "assigned_user": null,
    "cat_name": "Category Name Before",
    "excludes": [
        "exclude1",
        "excludeBefore"
    ],
    "id": 2,
    "includes": [
        "include1",
        "include2"
    ],
    "parent_id": null
}
This is the object I had before the PUT request. If I restart my Python server and repeat the GET request, I get the old object. It seems the session is storing the data, but for some reason it is not propagating to the database.
It might be good to know that if I run the update command in pgAdmin, it updates the row just fine.
UPDATE: I have also used these methods (as talked about here) to update, but still the same problem:
# using the session to update
session.query(Category).filter_by(id=category.id).update({
    "cat_name": category.id,
    "assigned_user": category.assigned_user,
    "includes": category.includes,
    "excludes": category.excludes,
    "parent_id": category.parent_id
})

# using the category object to edit, then commit
category_from_db = session.query(Category).filter_by(id=category.id).first()
category_from_db.cat_name = category_json['cat_name']
category_from_db.assigned_user = category_json['assigned_user']
category_from_db.excludes = category_json['excludes']
category_from_db.includes = category_json['includes']
category_from_db.parent_id = category_json['parent_id']
session.commit()
Any ideas?
It turns out that each time I called get_session, I was creating a new session, and I was not closing the session after each HTTP request.
Here is what the server/api/category.py PUT request looks like:
@api.route('/categories/<int:id>', methods=['PUT'])
def update_category(id):
    category_json = request.json
    if category_json is None:
        return "Bad Request: Request not sent as json", 400
    valid_json, json_err = is_valid_request(category_json, ['cat_name', 'parent_id', 'includes', 'excludes', 'assigned_user'], "and")
    if not valid_json:
        return json_err, 400
    category = Category(
        id=id,
        cat_name=category_json['cat_name'],
        parent_id=category_json['parent_id'],
        includes=category_json['includes'],
        excludes=category_json['excludes'],
        assigned_user=category_json['assigned_user'],
    )
    session = get_session()
    try:
        updated_category = CategoryOperations.update(category, session)
        session.commit()
        print(updated_category.to_dict())
        return jsonify(updated_category.to_dict()), 200
    except Exception as e:
        print("ROLLBACK")
        print(e)
        session.rollback()
        return str(e), 500
    finally:
        session.close()  # <== The fix
Once I closed every session I opened after I was done with it, the problem was solved.
Hope this helps someone.
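As a variation on the same fix, here is a sketch of a context-manager pattern (session_scope is an illustrative name, built on the get_session above) that guarantees the commit/rollback/close sequence even on early returns:

from contextlib import contextmanager

@contextmanager
def session_scope():
    # Transactional scope: commit on success, roll back on error,
    # and always close so the underlying connection is released.
    session = get_session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()

Usage inside a view:

with session_scope() as session:
    updated_category = CategoryOperations.update(category, session)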

What Promoted Object should I use when creating an AdSet with lead_generation optimization goal?

I'm using the facebookads Python API, v2.6.
I'm trying to create an AdSet with optimization goal = lead_generation.
This is my code:
ad_set = AdSet(parent_id='act_%s' % FB_ACCOUNT)
ad_set[AdSet.Field.name] = 'Teste AdSet'
ad_set[AdSet.Field.campaign_id] = '6043402838999'
ad_set[AdSet.Field.status] = AdSet.Status.paused
ad_set[AdSet.Field.billing_event] = AdSet.BillingEvent.impressions
ad_set[AdSet.Field.optimization_goal] = AdSet.OptimizationGoal.lead_generation
ad_set[AdSet.Field.daily_budget] = 100
ad_set[AdSet.Field.bid_amount] = 1
ad_set[AdSet.Field.start_time] = '2016-07-01'
ad_set[AdSet.Field.promoted_object] =
ad_set[AdSet.Field.targeting] = {Targeting.Field.geo_locations: {'countries': ['BR']}, Targeting.Field.genders: [1], Targeting.Field.age_min: 20, Targeting.Field.age_max: 24}
ad_set.remote_create()
But when I run this I get this error:
Status: 400
Response:
{
    "error": {
        "code": 100,
        "is_transient": false,
        "error_subcode": 1885024,
        "error_user_msg": "When creating an ad set within a campaign using the LEAD_GENERATION objective, a promoted object must be specified.",
        "error_user_title": "Promoted Object Missing",
        "message": "Invalid parameter",
        "type": "OAuthException",
        "fbtrace_id": "B9hyZlpzS7O"
    }
}
I tried to find documentation about this, but could not. In the official docs I don't see LEAD_GENERATION among the promoted object options:
https://developers.facebook.com/docs/marketing-api/reference/ad-campaign#Creating
Has anyone had this problem?
In case anyone has the same issue, you have to use page_id.
The ad set must have its promoted_object set to the corresponding <PAGE_ID>.
reference:
https://developers.facebook.com/docs/marketing-api/guides/lead-ads/create#create
You have to specify your associated page_id:
promoted_object={"page_id": "<PAGE_ID>"}
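Applied to the original code, that is the one missing line (a sketch; <PAGE_ID> stays a placeholder for the page backing the lead form):

ad_set[AdSet.Field.promoted_object] = {'page_id': '<PAGE_ID>'}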
The code below may help you:
from facebook_business.adobjects.adaccount import AdAccount
from facebook_business.adobjects.adset import AdSet
from facebook_business.api import FacebookAdsApi

access_token = '<ACCESS_TOKEN>'
app_secret = '<APP_SECRET>'
app_id = '<APP_ID>'
id = '<AD_ACCOUNT_ID>'
FacebookAdsApi.init(access_token=access_token)

fields = [
]
params = {
    'name': 'A CPA Ad Set',
    'campaign_id': '<adCampaignLinkClicksID>',
    'daily_budget': '5000',
    'start_time': '2019-01-09T21:31:19-0800',
    'end_time': '2019-01-16T21:31:19-0800',
    'billing_event': 'IMPRESSIONS',
    'optimization_goal': 'REACH',
    'bid_amount': '1000',
    'promoted_object': {'page_id': '<pageID>'},
    'targeting': {'geo_locations': {'countries': ['US']}},
    'user_os': 'iOS',
    'publisher_platforms': 'facebook',
    'device_platforms': 'mobile',
}
print(AdAccount(id).create_ad_set(
    fields=fields,
    params=params,
))

Django "is not JSON serializable" - ajax, views.py - How to implement?

I have read the documentation, but I am not exactly sure how to implement serializer.serialize for JSON objects in my views.py; I would appreciate help understanding this a little better. I have the following code in my views.py:
@user_passes_test(lambda u: u.is_superuser)
def ProjDetails(request):
    proj_id = request.GET['proj_id']
    proj = Proj.objects.filter(id=proj_id)
    role_list = ProjRole.objects.filter(proj=proj)
    proj = {
        "proj": proj,
        "roles": []
    }
    for r in role_list:
        proj['roles'].append(r.id)
    return HttpResponse(json.dumps(proj), content_type='application/json; charset=UTF-8')
I am trying to call this with $.ajax (I am still working on the Ajax, so it is probably not right):
$('#proj_list #sel_proj').click(function(){
    $('div.sel').removeClass("sel");
    $(this).addClass("sel");
    var project_id = $(this).data('id');
    $.ajax({
        url: '../../proj_details',
        data: {proj_id: proj_id},
        // dataType: 'html',
        success: function(data){
            $('#proj_display').html(data)
        },
        error: function () {
            alert("Failed to find the project!")
        }
    });
});
Once I get the JSON call to work, then I will focus more on the ajax.
The biggest problem is that I am getting a 500 HTTP error with:
TypeError at ../proj_details
[<Project: Example>] is not JSON serializable
I am using Django 1.7. I even added SESSION_SERIALIZER = 'django.contrib.sessions.serializers.PickleSerializer' to my settings.py, without any luck. So I imported serializers from django.core and tried to use serializers.serialize, but I don't understand how to implement it; my errors just keep getting worse. I have seen other posts with the same error, but I am still not sure how they apply to my particular requirements.
+++++++++++++++ EDIT +++++++++++++++++++
So the only way I have been able to get this to work without multiple errors (circular errors, multiple-argument errors, etc.) is the following:
def ProjDetails(request):
    def date_handler(obj):
        return obj.strftime("%B %d, %Y") if hasattr(obj, 'strftime') else obj

    proj_id = request.GET['proj_id']
    proj = Proj.objects.get(id=proj_id)
    corp = Corp.objects.get(id=proj.corp.id)
    role_list = ProjRole.objects.filter(proj=proj).all()
    proj = {
        "proj": {
            'title': proj.title,
            'id': proj.id,
            'date': proj.date,
            'description': proj.description
        },
        "roles": [],
        "company": {
            'name': corp.name,
            'pic': unicode(corp.pic),
        }
    }
    for r in role_list:
        proj['roles'].append(r.name)
    return HttpResponse(json.dumps(proj, default=date_handler), content_type='application/json; charset=UTF-8')
The only thing I don't like about this is that I have to manually pull the attributes I want from the model into the dictionary, instead of pulling all the attributes from the model and then choosing which ones to use in my templates. I would rather not have to pull everything like in my example above. The "roles": [] is giving me some hiccups too, because I can't seem to get it to work when there are multiple roles for a proj object.
I like Eugene's method because it would be cleaner, but I can't seem to get it to work with the corp model. The proj table has a corp_id, yet I keep getting "corp_id is not an attribute" when I attempt it using .values().get() on the proj object. I don't understand how to implement grzgrzgrz3's answer either. I usually work more with JS, HTML, and CSS; I am new to Django/Python for web development.
So any suggestions to make this more efficient would be great. Thanks!
A Django model instance can't be serialized directly; you should use the values() method to retrieve a dict instead of a class instance. Also, you can use the only() method to retrieve just the id field for the roles:
proj = Proj.objects.filter(id=proj_id).values().get()
role_list = ProjRole.objects.only("id").filter(proj__id=proj_id)

proj = {
    "proj": proj,
    "roles": role_list
}
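One caveat worth adding, hedged: a QuerySet itself is still not JSON serializable, so before json.dumps the roles may need to be materialized into plain values, for example:

role_list = list(ProjRole.objects.filter(proj__id=proj_id).values_list("id", flat=True))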
Write a custom HttpResponse and handle all the non-serializable Python/Django objects there.
class HttpJsonResponse(HttpResponse):
    content_type = "application/json"

    def __init__(self, data):
        def json_serial(obj):
            """JSON serializer for objects not serializable by default json code"""
            if isinstance(obj, datetime.date):
                serial = obj.isoformat()
                return serial

        json_data = json.dumps(data, indent=4, default=json_serial)
        super(HttpJsonResponse, self).__init__(json_data, self.content_type)
In the example, the json_serial function converts a datetime.date object into a string, which is serializable.
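A quick usage sketch (the view name is illustrative):

import datetime

def proj_dates(request):
    # datetime.date passes through json_serial; plain types serialize as usual
    return HttpJsonResponse({'today': datetime.date.today(), 'roles': [1, 2]})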
*UPDATE
You can mix both answers:
def ProjDetails(request):
    proj_id = request.GET['proj_id']
    proj = Proj.objects.filter(id=proj_id).values().get()
    # proj is now a dict, so the foreign key shows up as proj['corp_id']
    corp = Corp.objects.filter(id=proj['corp_id']).values().get()
    role_list = list(ProjRole.objects.filter(proj__id=proj_id).values())
    proj = {
        "proj": proj,
        "roles": role_list,
        "company": corp
    }
    return HttpJsonResponse(proj)
Make sure you are importing the datetime module:
import datetime
and not the datetime class:
from datetime import datetime
My answer, as described above; this is what worked for me.
def ProjDetails(request):
    def date_handler(obj):
        return obj.strftime("%B %d, %Y") if hasattr(obj, 'strftime') else obj

    proj_id = request.GET['proj_id']
    proj = Proj.objects.get(id=proj_id)
    corp = Corp.objects.get(id=proj.corp.id)
    role_list = ProjRole.objects.filter(proj=proj).all()
    proj = {
        "proj": {
            'title': proj.title,
            'id': proj.id,
            'date': proj.date,
            'description': proj.description
        },
        "roles": [],
        "company": {
            'name': corp.name,
            'pic': unicode(corp.pic),
        }
    }
    for r in role_list:
        proj['roles'].append(r.name)
    return HttpResponse(json.dumps(proj, default=date_handler), content_type='application/json; charset=UTF-8')

Searching ID or property for match in Mongo

Goal:
I want to allow the user to search for a document by ID, or allow other text-based queries.
Code:
l_search_results = list(
    cll_sips.find(
        {
            '$or': [
                {'_id': ObjectId(s_term)},
                {'s_text': re.compile(s_term, re.IGNORECASE)},
                {'choices': re.compile(s_term, re.IGNORECASE)}
            ]
        }
    ).limit(20)
)
Error:
<Whatever you searched for> is not a valid ObjectId
s_term needs to be a valid object ID (or at least in the right format) when you pass it to the ObjectId constructor. Since it's sometimes not an ID, that explains why you get the exception.
Try something like this instead:
from bson.errors import InvalidId

or_filter = [
    {'s_text': re.compile(s_term, re.IGNORECASE)},
    {'choices': re.compile(s_term, re.IGNORECASE)}
]

try:
    oid = ObjectId(s_term)
    or_filter.append({'_id': oid})
except InvalidId:
    pass

l_search_results = list(
    cll_sips.find({'$or': or_filter}).limit(20)
)
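An equivalent variant, assuming the bson package that ships with PyMongo, pre-checks validity instead of catching the exception:

from bson.objectid import ObjectId

if ObjectId.is_valid(s_term):
    or_filter.append({'_id': ObjectId(s_term)})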
