I am looking to index a CSV file into Elasticsearch without using Logstash.
I am using the elasticsearch-dsl high-level library.
Given a CSV with a header, for example:
name,address,url
adam,hills 32,http://rockit.com
jane,valleys 23,http://popit.com
What would be the best way to index all the data by these fields? Eventually I'm looking to get each row to look like this:
{
    "name": "adam",
    "address": "hills 32",
    "url": "http://rockit.com"
}
This kind of task is easier with the lower-level elasticsearch-py library:
from elasticsearch import helpers, Elasticsearch
import csv

es = Elasticsearch()

with open('/tmp/x.csv') as f:
    reader = csv.DictReader(f)
    helpers.bulk(es, reader, index='my-index', doc_type='my-type')
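If you also need to control the document _id for each row (for example, keying each document on one of the CSV columns), helpers.bulk() also accepts full action dictionaries. A minimal sketch, assuming a hypothetical CSV column called name is used as the id:

from elasticsearch import helpers, Elasticsearch
import csv

es = Elasticsearch()

def actions(path):
    # yield one bulk action per CSV row, using the "name" column as the _id
    with open(path) as f:
        for row in csv.DictReader(f):
            yield {"_index": "my-index", "_id": row["name"], "_source": row}

helpers.bulk(es, actions('/tmp/x.csv'))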
If you want to create an Elasticsearch index from a .tsv/.csv file with strict types and a model for better filtering, you can do something like this:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from elasticsearch_dsl import DocType, Text


class ElementIndex(DocType):
    # one Text() field per CSV column
    ROWNAME_1 = Text()
    ROWNAME_2 = Text()

    class Meta:
        index = 'index_name'


def indexing(row):
    obj = ElementIndex(
        ROWNAME_1=str(row['ROWNAME_1']),
        ROWNAME_2=str(row['ROWNAME_2']),
    )
    obj.save(index="index_name")
    return obj.to_dict(include_meta=True)


def bulk_indexing(result):
    # ElementIndex.init(index="index_name")
    ElementIndex.init()
    es = Elasticsearch()
    # `result` is your iterable of dicts with the data from the source file
    bulk(client=es, actions=(indexing(row) for row in result))
    es.indices.refresh()
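To drive this from a CSV like the one in the question, something along these lines should work (the file path and the assumption that the CSV header names match the DocType fields are mine):

import csv

# read the CSV into a list of dicts, one per row, then bulk-index it
with open('/tmp/x.csv') as f:
    result = list(csv.DictReader(f))

bulk_indexing(result)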
I want to get all the data for the Lead object. I wrote the script below to obtain the Lead data, but I only get 2,000 records.
What do I have to do to get all the records of an object from Salesforce with Python?
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json

SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'


def main():
    # Authentication settings
    sf = Salesforce(username=SALESFORCE_USERNAME,
                    password=PASSWORD,
                    security_token=SECURITY_TOKEN)

    # Lead columns to be acquired
    columns = [
        "Id",
        "Email",
        "Company",
        "Address",
        "Phone",
        "FirstName",
        "LastName",
        "CreatedDate",
    ]
    sosl = 'SELECT {} FROM Lead'.format(', '.join(columns))

    # Data acquisition with SOSL
    data = sf.query(sosl)

    # Delete the CSV file if it exists
    output_csv = 'output.csv'
    if os.path.exists(output_csv):
        os.remove(output_csv)

    # Write to the CSV file
    for k, v in data.items():
        if type(v) is list:
            with open(output_csv, 'w', newline="") as f:
                writer = csv.DictWriter(f, fieldnames=columns)
                writer.writeheader()
                for d in v:
                    record = json.loads(json.dumps(d))
                    del record['attributes']
                    created = datetime.strptime(
                        record['CreatedDate'], '%Y-%m-%dT%H:%M:%S.%f%z')
                    record['CreatedDate'] = created.strftime('%Y-%m-%d %H:%M:%S')
                    writer.writerow(record)


if __name__ == '__main__':
    main()
If anyone knows, please let me know.
You can obtain all responsive records via the query_all() or query_all_iter() methods, which are documented under Queries in the simple_salesforce documentation.
Note that the query you are running is SOQL, not SOSL. SOSL is for full-text search.
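For reference, a minimal sketch of both methods, assuming the same credentials and a reduced column list (query_all_iter() is available in recent versions of simple_salesforce):

from simple_salesforce import Salesforce

sf = Salesforce(username=SALESFORCE_USERNAME,
                password=PASSWORD,
                security_token=SECURITY_TOKEN)

soql = "SELECT Id, Email, Company, CreatedDate FROM Lead"

# query_all() follows the pagination cursor for you and returns every record
result = sf.query_all(soql)
print(result['totalSize'], len(result['records']))

# query_all_iter() yields records lazily, which is easier on memory for large objects
for record in sf.query_all_iter(soql):
    print(record['Id'], record['Email'])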
I wanted to transfer the following CSV to Elasticsearch:
|hcode|hname|
|1|aaaa|
|2|bbbbb|
|3|ccccc|
|4|dddd|
|5|eeee|
|6|ffff|
I need to insert the hcode field as the document _id, but I am getting the error below:
File "C:\Users\Namali\Anaconda3\lib\site-packages\elasticsearch\connection\base.py", line 181, in _raise_error
status_code, error_message, additional_info
RequestError: RequestError(400, 'mapper_parsing_exception', 'failed to parse')
The Elasticsearch version is 7.1.1 and the Python version is 3.7.6.
Python code:
import csv
import json

from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


def csv_reader(file_obj, delimiter=','):
    reader_ = csv.reader(file_obj, delimiter=delimiter, quotechar='"')
    i = 1
    results = []
    for row in reader_:
        #try:
        #    es.index(index='hb_hotel_raw', doc_type='hb_hotel_raw', id=row[0],
        #             body=json.dump([row for row in reader_], file_obj))
        es.index(index='test', doc_type='test', id=row[0], body=json.dumps(row))
        #except:
        #    print("error")
        i = i + 1
        results.append(row)
        print(row)


if __name__ == "__main__":
    with open("D:\\namali\\rez\\data_mapping\\test.csv") as f_obj:
        csv_reader(f_obj)
First, doc_type is deprecated and should be omitted in Elasticsearch 7. Second, you need to pass valid JSON to Elasticsearch. I edited your code as below:
for row in reader_:
    _id = row[0].split("|")[1]
    text = row[0].split("|")[2]
    my_dict = {"hname": text}
    es.index(index='test', id=_id, body=my_dict)
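Alternatively, since the file itself is pipe-delimited, you could let csv.DictReader do the splitting. A rough sketch, assuming the file literally contains the lines shown above (the leading and trailing pipes only produce empty columns, which are ignored here):

import csv
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

with open("D:\\namali\\rez\\data_mapping\\test.csv") as f:
    for row in csv.DictReader(f, delimiter='|'):
        # "hcode" and "hname" come from the header row; hcode becomes the _id
        es.index(index='test', id=row['hcode'], body={'hname': row['hname']})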
<disclosure: I'm the developer of Eland and employed by Elastic>
If you're willing to load the CSV into a Pandas DataFrame you can use Eland to create/append the tabular data to an Elasticsearch index with all data types resolved properly.
I would recommend reading the pandas.read_csv() and eland.pandas_to_eland() documentation for ideas on how to accomplish this.
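For example, a rough sketch under the assumption that the pipe-delimited file above is saved as test.csv and that hcode should become the document id (as far as I know, eland uses the DataFrame index for the document ids by default):

import eland as ed
import pandas as pd
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# read the pipe-delimited sample; the leading/trailing pipes create empty
# columns, which are dropped, and hcode becomes the DataFrame index
df = pd.read_csv("test.csv", sep="|").dropna(axis=1, how="all").set_index("hcode")

# write the DataFrame to Elasticsearch; pandas dtypes are mapped to ES field types
ed.pandas_to_eland(
    pd_df=df,
    es_client=es,
    es_dest_index="test",
    es_if_exists="replace",  # or "append"
    es_refresh=True,
)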
I have a list with a JSON like so:
print(type(listed)) # <class 'list'>
print (listed)
[
    {
        "email": "x#gmail.com",
        "fullname": "xg gf",
        "points": 5,
        "image_url": "https://imgur.com/random.pmg"
    },
    {
        ... similar json for the next user and so on
    }
]
I'm trying to insert them into my postgres database that has a model like this:
class Users(db.Model):
    __tablename__ = 'users'
    email = db.Column(db.String(), primary_key=True)
    displayName = db.Column(db.String())
    image = db.Column(db.String())
    points = db.Column(db.Integer())
But I'm quite stuck; I've tried several approaches but none worked. Can anyone guide me with an example of how to do it properly?
Here's a solution without pandas, using SQLAlchemy Core.
First the imports and the engine:
import sqlalchemy
from sqlalchemy.orm import Session

engine = sqlalchemy.create_engine('...')
load the metadata using the engine as the bind parameter
metadata = sqlalchemy.MetaData(bind=engine)
make a reference to the table
users_table = sqlalchemy.Table('users', metadata, autoload=True)
you can then start your inserts
for user in json:
    query = users_table.insert().values(**user)
    my_session = Session(engine)
    my_session.execute(query)
    my_session.commit()
    my_session.close()
This creates a session for every user in json, but I thought you might like it anyway. It's very flexible and works for any table; you don't even need a model. Just make sure the JSON doesn't contain any keys that don't exist as columns in the db (this means the JSON key and the db column name must be identical, e.g. "image_url" in both).
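If you would rather avoid opening a session per row, the same insert can be done in a single round trip (a sketch reusing the engine and users_table from above; json is the list of dicts):

# executemany-style insert: one statement for all rows, committed on exit
with engine.begin() as conn:
    conn.execute(users_table.insert(), json)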
Here is an example json list, like you provided.
json = [
    {
        "email": "x#gmail.com",
        "fullname": "xg gf",
        "points": 5,
        "image_url": "https://imgur.com/random.pmg"
    },
    {
        "email": "onur#gmail.com",
        "fullname": "o g",
        "points": 7,
        "image_url": "https://imgur.com/random_x.pmg"
    }
]
Now create an empty dataframe all_df and iterate over your json list.
Each iteration creates a dataframe from the dictionary in the list, transposes it and appends it to all_df.
import pandas as pd

all_df = pd.DataFrame()
for i in json:
    df = pd.DataFrame.from_dict(data=i, orient='index').T
    # DataFrame.append() was removed in pandas 2.0, so append with pd.concat
    all_df = pd.concat([all_df, df], ignore_index=True)
all_df now contains one row per dictionary in the list, with the columns email, fullname, points and image_url.
Now you can go ahead and create a session to your database and push all_df:
all_df.to_sql(con=your_session.bind, name='your_table_name', if_exists='your_preferred_method', index=False)
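For completeness, a minimal sketch of the engine wiring assumed by that last line (the connection string and table name are placeholders):

from sqlalchemy import create_engine

engine = create_engine('postgresql://user:password@localhost:5432/mydb')  # placeholder URL

# write all rows into the existing users table
all_df.to_sql(con=engine, name='users', if_exists='append', index=False)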
Using marshmallow-sqlalchemy you can validate the incoming JSON and create general utilities for loading and dumping data.
Define the schemas:
schema.py
from marshmallow import EXCLUDE
from marshmallow_sqlalchemy import ModelSchema

from app import db
from app.models import Users  # import the Users model from wherever it is defined


class UserSchema(ModelSchema):
    class Meta(ModelSchema.Meta):
        model = Users
        sqla_session = db.session


user_schema_full = UserSchema(only=(
    'email',
    'displayName',
    'image',
    'points'
))
utils.py
The exact details below don't matter, but the idea is to create general utilities for going from JSON to ORM objects and from ORM objects back to JSON. schema_partial is used for auto-generated primary keys.
from marshmallow import ValidationError

# user_schema_full comes from schema.py above; InvalidData and InvalidDump
# are assumed to be your own API exception classes.


def loadData(data, schema_partial, many=False,
             schema_full=None, instance=None):
    try:
        if instance is not None:
            answer = schema_full.load(data, instance=instance, many=many)
        else:
            answer = schema_partial.load(data, many=many)
    except ValidationError as errors:
        raise InvalidData(errors, status_code=400)
    return answer


def loadUser(data, instance=None, many=False):
    return loadData(data=data,
                    schema_partial=user_schema_full,
                    many=many,
                    schema_full=user_schema_full,
                    instance=instance)


def dumpData(load_object, schema, many=False):
    try:
        answer = schema.dump(load_object, many=many)
    except ValidationError as errors:
        raise InvalidDump(errors, status_code=400)
    return answer


def dumpUser(load_object, many=False):
    return dumpData(load_object, schema=user_schema_full, many=many)
Use loadUser and dumpUser within the API code to keep it clean and flat.
api.py
# app, db, request and jsonify come from your Flask setup; loadUser/dumpUser from utils.py
@app.route('/users/', methods=['POST'])
def post_users():
    """Post many users"""
    users_data = request.get_json()
    users = loadUser(users_data, many=True)
    for user in users:
        db.session.add(user)
    object_dump = dumpUser(users, many=True)
    db.session.commit()
    return jsonify(object_dump), 201
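A quick way to exercise the endpoint is Flask's test client (this sketch assumes app is your Flask application object and that the payload keys match the Users columns):

payload = [
    {"email": "x#gmail.com", "displayName": "xg gf",
     "image": "https://imgur.com/random.pmg", "points": 5},
]

with app.test_client() as client:
    resp = client.post('/users/', json=payload)
    print(resp.status_code, resp.get_json())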
I'm getting data from an API and saving it into a list called data.
After this, I'm sending the data to another class to format it.
I want to create a dict so I can save the data. I'm trying to do something like this:
import json
import datetime


class MovieFormatter():
    def format(self, data):
        data['trailers'] = data.get('trailers') or []
        data = {
            "movie_info_name": data['title'],
            "movie_info_duration": data['duration'],
            "movie_info_description": data['synopsis'],
            "movie_info_genres": data['genres'],
            "movie_info_actor": data['cast'],
            "movie_info_director": data['director'],
            "dictionary": [{'url': x['url'], 'type': x['type']}
                           for x in data['images'] + data['trailers']]
        }
        return data
Is this the right way to do it?
It seems that the data object passed to your function is already a dictionary, from the way it has been indexed e.g. data['title'].
Try this:
_in_data = ["a", "b", "c"]
_out_data = ["x", "y", "z"]

_dict = {}
print(_dict)

for i in range(len(_in_data)):
    _dict[_in_data[i]] = _out_data[i]

print(_dict)
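For what it's worth, the same mapping can be built without an index loop:

_dict = dict(zip(_in_data, _out_data))
# or, as a comprehension:
_dict = {k: v for k, v in zip(_in_data, _out_data)}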
The examples I have found so far stream JSON to BQ, e.g. https://cloud.google.com/bigquery/streaming-data-into-bigquery
How do I stream CSV or any other file type into BQ? Below is a block of code for streaming; the issue seems to be in insert_all_data, where 'row' is defined as JSON. Thanks.
# [START stream_row_to_bigquery]
def stream_row_to_bigquery(bigquery, project_id, dataset_id, table_name, row,
                           num_retries=5):
    insert_all_data = {
        'rows': [{
            'json': row,
            # Generate a unique id for each row so retries don't accidentally
            # duplicate insert
            'insertId': str(uuid.uuid4()),
        }]
    }
    return bigquery.tabledata().insertAll(
        projectId=project_id,
        datasetId=dataset_id,
        tableId=table_name,
        body=insert_all_data).execute(num_retries=num_retries)
# [END stream_row_to_bigquery]
This is how I wrote it using the bigquery-python library, very easily.
def insert_data(datasetname, table_name, DataObject):
    client = get_client(project_id, service_account=service_account,
                        private_key_file=key, readonly=False,
                        swallow_results=False)
    insertObject = DataObject
    try:
        result = client.push_rows(datasetname, table_name, insertObject)
    except Exception as err:
        print(err)
        raise
    return result
Here insertObject is a list of dictionaries, where one dictionary contains one row.
eg: [{field1: value1, field2: value2}, {field1: value3, field2: value4}]
The CSV can be read as follows:
import pandas as pd

# `C` is the list of date columns to parse; `schema` is the table schema
# (a list of dicts with a 'name' key) and `_sorted_list` holds the column
# data in the same order -- both come from the author's own preprocessing.
fileCsv = pd.read_csv(file_path + '/' + filename, parse_dates=C,
                      infer_datetime_format=True)

data = []
for row_x in range(len(fileCsv.index)):
    i = 0
    row = {}
    for col_y in schema:
        row[col_y['name']] = _sorted_list[i]['col_data'][row_x]
        i += 1
    data.append(row)

insert_data(datasetname, table_name, data)
The data list can then be passed to insert_data.
This will do the job, but there is still a limitation that I have already raised here.
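If your BigQuery schema field names match the CSV column names, pandas can also produce that list of row dictionaries directly (a sketch reusing fileCsv from above):

# one dict per row, keyed by column name -- the shape push_rows() expects
data = fileCsv.to_dict(orient='records')
insert_data(datasetname, table_name, data)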