I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached my code below. I want to pass three URLs and get back three different dictionaries holding the values. I'm stuck and don't understand how to go about it; I have tried using loops, but it isn't working out for me.
url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    #### requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]

    # create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])

    # run a loop over all the dates and adding to dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value
You can use the next example to get data from all 3 URLs and create a dataframe/dictionary from it:
import re

import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    for date, hash_rate in re.findall(
        r'\[new Date\("(.*?)"\),(.*?)\]', html_doc
    ):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan")
                if hash_rate == "null"
                else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/HashRate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)
Prints:
{
    "Bitcoin": {
        "Date": [
            Timestamp("2009-01-03 00:00:00"),
            Timestamp("2009-01-04 00:00:00"),
            Timestamp("2009-01-05 00:00:00"),
...
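If you only want the three dictionaries (one per coin) and don't need the DataFrame, here is a minimal sketch using the same regex as above:

import re
import requests

urls = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

hash_rates = {}
for name, u in urls.items():
    html_doc = requests.get(u).text
    # one dict per coin: {date string: hash rate (None where the page has "null")}
    hash_rates[name] = {
        date: None if value == "null" else float(value)
        for date, value in re.findall(r'\[new Date\("(.*?)"\),(.*?)\]', html_doc)
    }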
I created this code and I am able to pull the data I want, but I'm not able to sort it as it should be. I'm guessing it has to do with the way I am appending each item while ignoring the index, but I can't find my way around it.
This is my code:
import json
import pandas as pd

# load json object
with open("c:\Sample.json", "r", encoding='utf-8') as file:
    data = file.read()

data2 = json.loads(data)
print("Type:", type(data2))

cls = ['Image', 'Email', 'User', 'Members', 'Time']
df = pd.DataFrame(columns=cls)

for d in data2['mydata']:
    for k, v in d.items():
        # print(k)
        if k == 'attachments':
            # print(d.get('attachments')[0]['id'])
            image = d.get('attachments')[0]['id']
            df = df.append({'Image': image}, ignore_index=True)
            # df['Message'] = image
        if k == 'author_user_email':
            # print(d.get('author_user_email'))
            email = d.get('author_user_email')
            df = df.append({'Email': email}, ignore_index=True)
            # df['Email'] = email
        if k == 'author_user_name':
            # print(d.get('author_user_name'))
            user = d.get('author_user_name')
            df = df.append({'User': user}, ignore_index=True)
            # df['User'] = user
        if k == 'room_name':
            # print(d.get('room_name'))
            members = d.get('room_name')
            df = df.append({'Members': members}, ignore_index=True)
            # df['Members'] = members
        if k == 'ts_iso':
            # print(d.get('ts_iso'))
            time = d.get('ts_iso')
            df = df.append({'Time': time}, ignore_index=True)
            # df['Time'] = time

df
print('Finished getting Data')
df1 = df.head()
print(df)
print(df.head())
df.to_csv(r'c:\sample.csv', encoding='utf-8')
The code gives me this as the result
I am looking to get this
Data of the file is this:
{
    "mydata": [
        {
            "attachments": [
                {
                    "filename": "image.png",
                    "id": "888888888"
                }
            ],
            "author_user_email": "email#email.com",
            "author_user_id": "91",
            "author_user_name": "Marlone",
            "message": "",
            "room_id": "999",
            "room_members": [
                {
                    "room_member_id": "91",
                    "room_member_name": "Marlone"
                },
                {
                    "room_member_id": "9191",
                    "room_member_name": " +16309438985"
                }
            ],
            "room_name": "SMS [Marlone] [ +7777777777]",
            "room_type": "sms",
            "ts": 55,
            "ts_iso": "2021-06-13T18:17:32.877369+00:00"
        },
        {
            "author_user_email": "email#email.com",
            "author_user_id": "21",
            "author_user_name": "Chris",
            "message": "Hi",
            "room_id": "100",
            "room_members": [
                {
                    "room_member_id": "21",
                    "room_member_name": "Joe"
                },
                {
                    "room_member_id": "21",
                    "room_member_name": "Chris"
                }
            ],
            "room_name": "Direct [Chris] [Joe]",
            "room_type": "direct",
            "ts": 12345678910,
            "ts_iso": "2021-06-14T14:42:07.572479+00:00"
        }
    ]
}
Any help would be appreciated. I am new to python and am learning on my own.
Try:
import json
import pandas as pd

with open("your_data.json", "r") as f_in:
    data = json.load(f_in)

tmp = []
for d in data["mydata"]:
    image = d.get("attachments", [{"id": None}])[0]["id"]
    email = d.get("author_user_email")
    user = d.get("author_user_name")
    members = d.get("room_name")
    time = d.get("ts_iso")
    tmp.append((image, email, user, members, time))

df = pd.DataFrame(tmp, columns=["Image", "Email", "User", "Members", "Time"])
print(df)
Prints:
       Image            Email     User                       Members                              Time
0  888888888  email#email.com  Marlone  SMS [Marlone] [ +7777777777]  2021-06-13T18:17:32.877369+00:00
1       None  email#email.com    Chris          Direct [Chris] [Joe]  2021-06-14T14:42:07.572479+00:00
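If the rows should also end up ordered by timestamp, a small optional follow-up (assuming the Time strings are ISO 8601 as shown above):

df["Time"] = pd.to_datetime(df["Time"])
df = df.sort_values("Time").reset_index(drop=True)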
Although the other answer does work, pandas has a built-in reader for JSON files, pd.read_json: https://pandas.pydata.org/pandas-docs/version/1.1.3/reference/api/pandas.read_json.html
It has the benefit of being able to handle very large datasets via chunking, as well as processing quite a few different formats. The other answer would not be performant for a large dataset.
This would get you started:
import pandas as pd
df = pd.read_json("c:\Sample.json")
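Note that pd.read_json won't flatten the nested attachments/room_members lists by itself; a hedged sketch using pd.json_normalize (assuming the same Sample.json layout shown in the question) that yields the five target columns:

import json
import pandas as pd

with open(r"c:\Sample.json", "r", encoding="utf-8") as f:
    records = json.load(f)["mydata"]

# flatten the top-level keys; list-valued fields (attachments, room_members) stay as objects
df = pd.json_normalize(records)

# pull the first attachment id out of the list, guarding against records without attachments
df["Image"] = df["attachments"].apply(
    lambda a: a[0]["id"] if isinstance(a, list) and a else None
)

df = df.rename(columns={
    "author_user_email": "Email",
    "author_user_name": "User",
    "room_name": "Members",
    "ts_iso": "Time",
})[["Image", "Email", "User", "Members", "Time"]]
print(df)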
The problem is that append() adds a new row each time. So you have to use at[] (https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.at.html), specifying the index/row. See below. Some print/debug messages were left in, and the paths to the input and output files were changed a little because I'm on Linux.
import json
import pandas as pd
import pprint as pp

# load json object
with open("Sample.json", "r", encoding='utf-8') as file:
    data = file.read()

data2 = json.loads(data)
# pp.pprint(data2)

cls = ['Image', 'Email', 'User', 'Members', 'Time']
df = pd.DataFrame(columns=cls)
pp.pprint(df)

index = 0
for d in data2['mydata']:
    for k, v in d.items():
        # print(k)
        if k == 'attachments':
            # print(d.get('attachments')[0]['id'])
            image = d.get('attachments')[0]['id']
            df.at[index, 'Image'] = image
            # df['Message'] = image
        if k == 'author_user_email':
            # print(d.get('author_user_email'))
            email = d.get('author_user_email')
            df.at[index, 'Email'] = email
            # df['Email'] = email
        if k == 'author_user_name':
            # print(d.get('author_user_name'))
            user = d.get('author_user_name')
            df.at[index, 'User'] = user
            # df['User'] = user
        if k == 'room_name':
            # print(d.get('room_name'))
            members = d.get('room_name')
            df.at[index, 'Members'] = members
            # df['Members'] = members
        if k == 'ts_iso':
            # print(d.get('ts_iso'))
            time = d.get('ts_iso')
            df.at[index, 'Time'] = time
            # df['Time'] = time
    index += 1

# start indexing from 0 (reset_index returns a new frame, so assign it back)
df = df.reset_index(drop=True)

# replace empty cells with 'None'
df.fillna('None', inplace=True)

pp.pprint(df)
print('Finished getting Data')
df1 = df.head()
print(df)
print(df.head())
df.to_csv(r'sample.csv', encoding='utf-8')
I have an example json data file which has the following structure:
{
    "Header": {
        "Code1": "abc",
        "Code2": "def",
        "Code3": "ghi",
        "Code4": "jkl"
    },
    "TimeSeries": {
        "2020-11-25T03:00:00+00:00": {
            "UnitPrice": 1000,
            "Amount": 10000
        },
        "2020-11-26T03:00:00+00:00": {
            "UnitPrice": 1000,
            "Amount": 10000
        }
    }
}
When I parse this in Databricks with the command:
df = spark.read.json("/FileStore/test.txt")
I get 2 objects as output: Header and TimeSeries. For TimeSeries, I want to flatten the structure so it has the following schema:
Date
UnitPrice
Amount
As the date field is a key, I am currently only able to access it by iterating through the column names and then using them in dot notation dynamically:
from pyspark.sql.functions import lit

def flatten_json(data):
    columnlist = data.select("TimeSeries.*")
    count = 0
    for name in data.select("TimeSeries.*"):
        df1 = (data.select("Header.*")
                   .withColumn("Timeseries", lit(columnlist.columns[count]))
                   .withColumn("join", lit("a")))
        df2 = (data.select("TimeSeries." + columnlist.columns[count] + ".*")
                   .withColumn("join", lit("a")))
        if count == 0:
            df3 = df1.join(df2, on=['join'], how="inner")
        else:
            df3 = df3.union(df1.join(df2, on=['join'], how="inner"))
        count = count + 1
    return df3
This is far from ideal. Does anyone know a better method to create the described dataframe?
The idea:
Step 1: Extract Header and TimeSeries separately.
Step 2: For each field in the TimeSeries object, extract the Amount and UnitPrice, together with the name of the field, stuff them into a struct.
Step 3: Merge all these structs into an array column, and explode it.
Step 4: Extract Timeseries, Amount and UnitPrice from the exploded column.
Step 5: Cross join with the Header row.
import pyspark.sql.functions as F

header_df = df.select("Header.*")
timeseries_df = df.select("TimeSeries.*")

fieldNames = enumerate(timeseries_df.schema.fieldNames())
cols = [
    F.struct(
        F.lit(name).alias("Timeseries"),
        F.col(name).getItem("Amount").alias("Amount"),
        F.col(name).getItem("UnitPrice").alias("UnitPrice"),
    ).alias("ts_" + str(idx))
    for idx, name in fieldNames
]

combined = F.explode(F.array(cols)).alias("comb")
timeseries = timeseries_df.select(combined).select('comb.Timeseries', 'comb.Amount', 'comb.UnitPrice')

result = header_df.crossJoin(timeseries)
result.show(truncate=False)
Output:
+-----+-----+-----+-----+-------------------------+------+---------+
|Code1|Code2|Code3|Code4|Timeseries |Amount|UnitPrice|
+-----+-----+-----+-----+-------------------------+------+---------+
|abc |def |ghi |jkl |2020-11-25T03:00:00+00:00|10000 |1000 |
|abc |def |ghi |jkl |2020-11-26T03:00:00+00:00|10000 |1000 |
+-----+-----+-----+-----+-------------------------+------+---------+
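A possible alternative sketch (my own variant, not part of the answer above): if the file is read with an explicit schema that declares TimeSeries as a map, the dates become map keys and F.explode yields the rows directly. This assumes the same /FileStore/test.txt path and that UnitPrice/Amount fit in a long:

from pyspark.sql import functions as F
from pyspark.sql.types import LongType, MapType, StringType, StructField, StructType

schema = StructType([
    StructField("Header", StructType([
        StructField("Code1", StringType()),
        StructField("Code2", StringType()),
        StructField("Code3", StringType()),
        StructField("Code4", StringType()),
    ])),
    # map: date string -> {UnitPrice, Amount}
    StructField("TimeSeries", MapType(StringType(), StructType([
        StructField("UnitPrice", LongType()),
        StructField("Amount", LongType()),
    ]))),
])

df = spark.read.json("/FileStore/test.txt", schema=schema, multiLine=True)

result = (
    df.select("Header.*", F.explode("TimeSeries").alias("Timeseries", "vals"))
      .select("Code1", "Code2", "Code3", "Code4",
              "Timeseries", "vals.Amount", "vals.UnitPrice")
)
result.show(truncate=False)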
I am trying to upload events to a Facebook offline conversion dataset with some custom fields, but I am receiving the following error:
Status: 400
Response:
{
    "error": {
        "message": "(#100) param data must be an array.",
        "type": "OAuthException",
        "code": 100,
        "fbtrace_id": "A5qsezd_MfvKEYYTVfPcu29"
    }
}
I am referring to this page to upload offline events.
https://developers.facebook.com/docs/marketing-api/offline-conversions/
CSV structure
email,event_name,event_time,value,dept,brand,tx_type,cust_type,cust_trend
79FBB38FC843911533020FD8DE5B29CBA9958F,Purchase,2020-06-15T07:42:47Z,100.25, RENTAL,NAN,PA,Active,Growth (+15% to LY)
8EF89542E99BF7D8C0D4AA9F218,Purchase,2020-06-15T17:46:13Z,50,DEPOSITS, NAN,Other,Active,Declined (-15% to LY)
4C83B542E9C9566AA8D6A5279839115E7C0C454A1,Purchase,2020-06-15T09:55:01Z,150,DEPOSITS, NAN,PA,Active,Declined (-15% to LY)
361604C2B8FC67,Purchase,2020-06-15T15:41:18Z,50,DEPOSITS, NAN,OtherNew (Less than 3 Months),Did Not Shop LY
09133B0CDFA527BA9013CA8F1A0382D76F9,Purchase,2020-06-15T08:44:47Z,1,DEPOSITS, NAN,PX,Active,Growth (+15% to LY)
50cff131E2B3042C6E533ss225146C37994E2C2,Purchase,2020-06-15T07:35:50Z,300,DEPOSITS, NAN,Other,ActiveGrowth (+10% to LY)
ECD35DBB79FF37B0FC95E131,Purchase,2020-06-15T16:13:28Z,50,DEPOSITS, NAN,PX,Active,Decline (-12% to LY)
code:
def upload_offline_conversion(**args):
    from facebook_business.adobjects.offlineconversiondataset import OfflineConversionDataSet
    from facebook_business.api import FacebookAdsApi
    import pandas as pd
    # import gcsfs
    import json

    access_token = access_token
    FacebookAdsApi.init(app_id=app_id, access_token=access_token)
    offline_dataset = OfflineConversionDataSet(dataset_id)

    df = pd.read_csv('UPLOADS.csv', sep=',')
    df['event_time'] = (pd.to_datetime(df['event_time']).astype(int) / 10 ** 9).astype(int).astype(str)
    df['match_keys'] = df.apply(lambda row: json.dumps({k: [row[k]] if k in ['email'] else row[k] for k in ['email'] if pd.notnull(row[k])}), axis=1)
    del df['email']  # deleting match_keys single columns since they are now useless
    df["currency"] = 'CAD'

    data = (df.groupby(['event_name', 'event_time', 'match_keys', 'value', 'currency'], as_index=False)
              .apply(lambda x: x[['dept', 'brand', 'tx_type', 'cust_type', 'cust_trend']].to_dict('r'))
              .reset_index()
              .rename(columns={0: 'custom_data'}).to_json(orient='records'))
    print(data)

    batch_limit = 2000  # Maximum number of events permitted in a single call
    for i in range(0, len(data), batch_limit):
        params = {
            'upload_tag': 'upload_test',
            'data': data[i:i+batch_limit],
        }
        # print(params)
        # offline_dataset.create_event(params=params)
Expected output:
data = [
    {
        match_keys: {"email": ['79FBB38FC843911533020FD8DE5B29CBA9958F']},
        currency: "CAD",
        value: 100.25,
        event_name: "Purchase",
        event_time: 1592206967,
        custom_data: {
            dept: "RENTAL",
            brand: "NAN",
            tx_type: "PA",
            cust_type: "ACTIVE",
            cust_trend: "Growth (+15% to LY)"
        },
    },
    {
        match_keys: {"email": ["8EF89542E99BF7D8C0D4AA9F218"]},
        currency: "CAD",
        value: 50,
        event_name: "Purchase",
        event_time: 1592243173,
        custom_data: {
            dept: "RENTAL",
            brand: "NAN",
            tx_type: "PA",
            cust_type: "ACTIVE",
            cust_trend: "Growth (+15% to LY)"
        },
    },
    # and so on...
]
My current output:
{'upload_tag': 'sales_upload_test_final',
'data': '[
{"event_name":"Purchase",
"event_time":"1592243173",
"match_keys":"{"\\email\\": [\\"8EF89542E99BF7D8C0D4AA9F218"\\]}",
"value":"50",
"currency":"CAD",
"custom_data":[{"dept":"DEPOSITS","brand":" NAN","tx_type":"Other","cust_type":"Active","cust_trend":"Declined (-15% to LY)"}]}]}
The 400 error happens because to_json(orient='records') returns a single JSON string, so data[i:i+batch_limit] slices characters rather than events and the data param is no longer an array; converting to a list of dicts with to_dict(orient='records') fixes that. You also need to specify the data processing options (the LDU flag) as of July 1st, 2020.
Code:
def upload_offline_conversion(**args):
    from facebook_business.adobjects.offlineconversiondataset import OfflineConversionDataSet
    from facebook_business.api import FacebookAdsApi
    import pandas as pd
    # import gcsfs
    import json

    access_token = access_token
    FacebookAdsApi.init(app_id=app_id, access_token=access_token)
    offline_dataset = OfflineConversionDataSet(dataset_id)

    df = pd.read_csv('UPLOADS.csv', sep=',')
    df['event_time'] = (pd.to_datetime(df['event_time']).astype(int) / 10 ** 9).astype(int).astype(str)
    df['match_keys'] = df.apply(lambda row: json.dumps({k: [row[k]] if k in ['email'] else row[k] for k in ['email'] if pd.notnull(row[k])}), axis=1)
    del df['email']  # deleting match_keys single columns since they are now useless
    df["currency"] = 'CAD'

    data = (df.groupby(['event_name', 'event_time', 'match_keys', 'value', 'currency'], as_index=False)
              .apply(lambda x: x[['dept', 'brand', 'tx_type', 'cust_type', 'cust_trend']].to_dict('r'))
              .reset_index()
              .rename(columns={0: 'custom_data'}).to_dict(orient='records'))

    df = pd.DataFrame(data)
    df["data_processing_options"] = [[]] * df.shape[0]  # value is either [] or ["LDU"]
    data = df.to_dict(orient="records")

    batch_limit = 2000  # Maximum number of events permitted in a single call
    for i in range(0, len(data), batch_limit):
        params = {
            'upload_tag': 'upload_test',
            'data': data[i:i+batch_limit],
        }
        # print(params)
        # offline_dataset.create_event(params=params)
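For reference, a hedged sketch of how a single event from the expected output above would look once data_processing_options is included (values copied from the first CSV row; an empty list means Limited Data Use is not enabled):

event = {
    "match_keys": {"email": ["79FBB38FC843911533020FD8DE5B29CBA9958F"]},
    "currency": "CAD",
    "value": 100.25,
    "event_name": "Purchase",
    "event_time": 1592206967,
    "custom_data": {
        "dept": "RENTAL",
        "brand": "NAN",
        "tx_type": "PA",
        "cust_type": "Active",
        "cust_trend": "Growth (+15% to LY)",
    },
    "data_processing_options": [],  # or ["LDU"] to enable Limited Data Use
}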
I have a dataset which contains a DateTime field. I need to group by hours and dispatch each group to a dictionary with the following structure:
{year_1:
    {month_1:
        {week_1:
            {day_1:
                {hour_1: df_1, hour_2: df_2}
            },
         week_2:
            {day_1:
                {hour_1: df_1}
            }
        },
     month_3:
        {week_1:
            {day_1:
                {hour_1: df_1, hour_2: df_2}
            }
        }
    },
 year_2:
    {month_5:
        {week_1:
            {day_1:
                {hour_2: df_2}
            }
        }
    }
}
To do that I am using the following code:
from datetime import datetime
import pandas as pd

df = pd.DataFrame({'date': [datetime(2015, 3, 17, 2), datetime(2014, 3, 24, 3), datetime(2014, 3, 17, 4)],
                   'hdg_id': [4041, 4041, 4041],
                   'stock': [1.0, 1.0, 1.0]})
df.loc[:, 'year'] = [x.year for x in df['date']]
df.loc[:, 'month'] = [x.month for x in df['date']]
df.loc[:, 'week'] = [x.week for x in df['date']]
df.loc[:, 'day'] = [x.day for x in df['date']]
df.loc[:, 'hour'] = [x.hour for x in df['date']]

result = {}
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    try:
        result[year]
    except KeyError:
        result[year] = {}
    try:
        result[year][month]
    except KeyError:
        result[year][month] = {}
    try:
        result[year][month][week]
    except KeyError:
        result[year][month][week] = {}
    try:
        result[year][month][week][day]
    except KeyError:
        result[year][month][week][day] = {}
    result[year][month][week][day][hour] = df_hour
As you can see, this is pretty much a brute-force solution, and I was looking for something cleaner and more understandable. Furthermore, it is also extremely slow. I tried different ways of grouping (Python Pandas Group by date using datetime data), and I also tried a MultiIndex built from each component of the datetime (Pandas DataFrame with MultiIndex: Group by year of DateTime level values). However, the problem is always how to create the dict. Ideally, I would just like to write something like:
result[year][month][week][day][hour] = df_hour
but to the best of my knowledge, I first need to initialize each dict.
You need dict.setdefault
result = {}
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    result.setdefault(year, {}) \
          .setdefault(month, {}) \
          .setdefault(week, {}) \
          .setdefault(day, {}) \
          .setdefault(hour, df_hour)
You can also subclass dict to do this
class Fict(dict):
    def __getitem__(self, item):
        # create (and return) a nested Fict the first time a missing key is accessed
        return super().setdefault(item, type(self)())

result = Fict()
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    result[year][month][week][day][hour] = df_hour
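A further alternative, if the implicit key creation of Fict feels too magical: the same autovivification can be had with collections.defaultdict:

from collections import defaultdict

def tree():
    # each missing key creates another nested defaultdict
    return defaultdict(tree)

result = tree()
for (year, month, week, day, hour), df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    result[year][month][week][day][hour] = df_hour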