Pandas - DateTime groupby to structured dict - python

I have a dataset which contains a DateTime field. I need to group it by hour and dispatch each group into a dictionary with the following structure:
{year_1:
    {month_1:
        {week_1:
            {day_1:
                {hour_1: df_1, hour_2: df_2}},
         week_2:
            {day_1:
                {hour_1: df_1}}},
     month_3:
        {week_1:
            {day_1:
                {hour_1: df_1, hour_2: df_2}}}},
 year_2:
    {month_5:
        {week_1:
            {day_1:
                {hour_2: df_2}}}}}
To do that I am using the following code:
import pandas as pd

# pd.datetime was removed in pandas 2.0; pd.Timestamp is the replacement.
df = pd.DataFrame({'date': [pd.Timestamp(2015, 3, 17, 2),
                            pd.Timestamp(2014, 3, 24, 3),
                            pd.Timestamp(2014, 3, 17, 4)],
                   'hdg_id': [4041, 4041, 4041],
                   'stock': [1.0, 1.0, 1.0]})
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['week'] = df['date'].dt.isocalendar().week
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
result = {}
# note: the column order must match the unpacking below (week before day)
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    try:
        result[year]
    except KeyError:
        result[year] = {}
    try:
        result[year][month]
    except KeyError:
        result[year][month] = {}
    try:
        result[year][month][week]
    except KeyError:
        result[year][month][week] = {}
    try:
        result[year][month][week][day]
    except KeyError:
        result[year][month][week][day] = {}
    result[year][month][week][day][hour] = df_hour
As you can see, this is pretty much a brute-force solution, and I was looking for something cleaner and more understandable. Furthermore, it is also extremely slow. I tried different ways of grouping (Python Pandas Group by date using datetime data) and I also tried a MultiIndex with each component of the datetime (Pandas DataFrame with MultiIndex: Group by year of DateTime level values). However, the problem is always how to create the dict. Ideally, I would like to just write something like:
result[year][month][week][day][hour] = df_hour
but to the best of my knowledge, I first need to initialize each dict.

You need dict.setdefault:
result = {}
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    result.setdefault(year, {}) \
          .setdefault(month, {}) \
          .setdefault(week, {}) \
          .setdefault(day, {}) \
          .setdefault(hour, df_hour)
You can also subclass dict to do this:
class Fict(dict):
    def __getitem__(self, item):
        # creating missing keys on access makes nested assignment "just work"
        return super().setdefault(item, type(self)())

result = Fict()
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    result[year][month][week][day][hour] = df_hour
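Another common option (not from the original answer) is a recursive collections.defaultdict, the classic "autovivification" trick; it behaves like the Fict subclass above, so the nested assignment the question asks for works directly:

from collections import defaultdict

def tree():
    # each missing key materializes another nested defaultdict on access
    return defaultdict(tree)

result = tree()
for to_unpack, df_hour in df.groupby(['year', 'month', 'week', 'day', 'hour']):
    year, month, week, day, hour = to_unpack
    result[year][month][week][day][hour] = df_hour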

Related

How to import a pandas dataframe into an Elasticsearch index in Python?

I have a dataframe df. I have to copy the previous month's data into the present month. My approach so far:
import datetime

import elasticsearch.helpers
import pandas as pd
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(localhost creds)  # placeholder credentials, as in the question

body = {"query": {"bool": {"must": [{"range": {"time_document": {"gte": "now+1m-1M/M", "lt": "now/M"}}}]}}}

mydate = datetime.datetime.now()
month_number = mydate.strftime("%b").upper()
year_number = mydate.strftime("%Y")
month = mydate.strftime("%m")
year = mydate.strftime("%Y")
day = mydate.strftime("%d")
time_document = str(year) + '-' + str(month) + '-01'

results = elasticsearch.helpers.scan(es, query=body, index="test_data")
df = pd.DataFrame.from_dict([document['_source'] for document in results])
df['time'] = time_document

# Below is how I import the dataframe into the ES index, but how the _id is
# created in this function is not clear to me. Reference:
# https://towardsdatascience.com/exporting-pandas-data-to-elasticsearch-724aa4dd8f62
def doc_generator(df):
    df_iter = df.iterrows()
    for index, document in df_iter:
        yield {
            "_index": 'test_data',
            "_type": "_doc",
            "_id": f"{document['id']}",
            "_source": document.to_dict(),  # added: without _source the rows themselves are never indexed
        }
    # (the original "raise StopIteration" was a bug: inside a generator it
    # becomes a RuntimeError on Python 3.7+ per PEP 479 - just let it return)

helpers.bulk(es, doc_generator(df))
Please help me out if there is any other way to do this, and explain how the _id is being created here.
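For the _id question, this is standard Elasticsearch behavior (my observation, not from the original post): the generator above sets _id verbatim from each row's id column, and if the _id key is omitted, Elasticsearch auto-generates a unique id itself. A minimal sketch of that variant:

# Hypothetical variant of the generator above: omit "_id" and let
# Elasticsearch auto-generate document ids.
def doc_generator_auto_id(df):
    for _, document in df.iterrows():
        yield {
            "_index": "test_data",
            "_source": document.to_dict(),
        }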

Creating multiple dataframes using a loop or function

I'm trying to extract the hash rate for 3 cryptocurrencies, and I have attached my code below. I want to pass three URLs and get back three different dictionaries holding the values. I'm stuck and don't understand how to go about it; I have tried using loops, but it is not working out for me.
url = {'Bitcoin' : 'https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y',
       'Ethereum': 'https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y',
       'Litecoin': 'https://bitinfocharts.com/comparison/litecoin-hashrate.html'}

for ele in url:
    # requesting the page and extracting the script which has date and values
    session = requests.Session()
    page = session.get(ele[i])
    soup = BeautifulSoup(page.content, 'html.parser')
    values = str(soup.find_all('script')[4])
    values = values.split('d = new Dygraph(document.getElementById("container"),')[1]
    # create an empty dict to append date and hashrates
    dict([("crypto_1 %s" % i, []) for i in range(len(url))])
    # run a loop over all the dates, adding to the dictionary
    for i in range(values.count('new Date')):
        date = values.split('new Date("')[i+1].split('"')[0]
        value = values.split('"),')[i+1].split(']')[0]
        dict([("crypto_1 %s" % i)[date] = value   # <- this is where I'm stuck (syntax error)
You can use the next example to see how to get the data from all 3 URLs and create a dataframe/dictionary from it:
import re
import requests
import pandas as pd

url = {
    "Bitcoin": "https://bitinfocharts.com/comparison/bitcoin-hashrate.html#3y",
    "Ethereum": "https://bitinfocharts.com/comparison/ethereum-hashrate.html#3y",
    "Litecoin": "https://bitinfocharts.com/comparison/litecoin-hashrate.html",
}

data = []
for name, u in url.items():
    html_doc = requests.get(u).text
    # each chart embeds its points as [new Date("..."), value] pairs
    for date, hash_rate in re.findall(r'\[new Date\("(.*?)"\),(.*?)\]', html_doc):
        data.append(
            {
                "Name": name,
                "Date": date,
                "Hash Rate": float("nan") if hash_rate == "null" else float(hash_rate),
            }
        )

df = pd.DataFrame(data)
df["Date"] = pd.to_datetime(df["Date"])

# here save df to CSV

# this will create a dictionary, where the keys are crypto names and values
# are dicts with keys Date/Hash Rate:
out = {}
for name, g in df.groupby("Name"):
    out[name] = g[["Date", "Hash Rate"]].to_dict(orient="list")

print(out)
Prints:
{
"Bitcoin": {
"Date": [
Timestamp("2009-01-03 00:00:00"),
Timestamp("2009-01-04 00:00:00"),
Timestamp("2009-01-05 00:00:00"),
...
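If a single wide table is preferable to a dict, a pivot works too (a sketch using the same df as above, not part of the original answer):

# one column per crypto, indexed by date; pivot_table tolerates duplicates
wide = df.pivot_table(index="Date", columns="Name", values="Hash Rate")
print(wide.tail())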

What is the data format returned by the AdWords API TargetingIdeaPage service?

When I query the AdWords API to get search volume data and trends through their TargetingIdeaSelector using the Python client library the returned data looks like this:
(TargetingIdeaPage){
   totalNumEntries = 1
   entries[] =
      (TargetingIdea){
         data[] =
            (Type_AttributeMapEntry){
               key = "KEYWORD_TEXT"
               value =
                  (StringAttribute){
                     Attribute.Type = "StringAttribute"
                     value = "keyword phrase"
                  }
            },
            (Type_AttributeMapEntry){
               key = "TARGETED_MONTHLY_SEARCHES"
               value =
                  (MonthlySearchVolumeAttribute){
                     Attribute.Type = "MonthlySearchVolumeAttribute"
                     value[] =
                        (MonthlySearchVolume){
                           year = 2016
                           month = 2
                           count = 2900
                        },
                        ...
                        (MonthlySearchVolume){
                           year = 2015
                           month = 3
                           count = 2900
                        },
                  }
            },
      }
}
This isn't JSON and appears to just be a messy Python list. What's the easiest way to flatten the monthly data into a Pandas dataframe with a structure like this?
Keyword | Year | Month | Count
keyword phrase 2016 2 10
The output is a sudsobject. I found that this code does the trick:
import suds.sudsobject as sudsobject
import pandas as pd

a = [sudsobject.asdict(x) for x in output]  # output = the response entries shown above
df = pd.DataFrame(a)
Addendum: This was once correct, but newer versions of the API (I tested 201802) now return zeep objects. However, zeep.helpers.serialize_object should do the same trick.
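A minimal sketch of that zeep route (my own, not from the original answer, and assuming the response object still exposes an entries list):

from zeep.helpers import serialize_object
import pandas as pd

# serialize_object recursively converts zeep objects into plain dicts
records = serialize_object(stats_page, target_cls=dict)
df = pd.DataFrame(records["entries"])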
Here's the complete code that I used to query the TargetingIdeaSelector with requestType STATS, and the method I used to parse the data into a usable dataframe; note the section starting "Parse results to pandas dataframe", as this takes the output given in the question above and converts it to a dataframe. Probably not the fastest or best, but it works! Tested with Python 2.7.
"""This code pulls trends for a set of keywords, and parses into a dataframe.
The LoadFromStorage method is pulling credentials and properties from a
"googleads.yaml" file. By default, it looks for this file in your home
directory. For more information, see the "Caching authentication information"
section of our README.
"""
from googleads import adwords
import pandas as pd
adwords_client = adwords.AdWordsClient.LoadFromStorage()
PAGE_SIZE = 10
# Initialize appropriate service.
targeting_idea_service = adwords_client.GetService(
'TargetingIdeaService', version='v201601')
# Construct selector object and retrieve related keywords.
offset = 0
stats_selector = {
'searchParameters': [
{
'xsi_type': 'RelatedToQuerySearchParameter',
'queries': ['donald trump', 'bernie sanders']
},
{
# Language setting (optional).
# The ID can be found in the documentation:
# https://developers.google.com/adwords/api/docs/appendix/languagecodes
'xsi_type': 'LanguageSearchParameter',
'languages': [{'id': '1000'}],
},
{
# Location setting
'xsi_type': 'LocationSearchParameter',
'locations': [{'id': '1027363'}] # Burlington,Vermont
}
],
'ideaType': 'KEYWORD',
'requestType': 'STATS',
'requestedAttributeTypes': ['KEYWORD_TEXT', 'TARGETED_MONTHLY_SEARCHES'],
'paging': {
'startIndex': str(offset),
'numberResults': str(PAGE_SIZE)
}
}
stats_page = targeting_idea_service.get(stats_selector)
##########################################################################
# Parse results to pandas dataframe
stats_pd = pd.DataFrame()
if 'entries' in stats_page:
for stats_result in stats_page['entries']:
stats_attributes = {}
for stats_attribute in stats_result['data']:
#print (stats_attribute)
if stats_attribute['key'] == 'KEYWORD_TEXT':
kt = stats_attribute['value']['value']
else:
for i, val in enumerate(stats_attribute['value'][1]):
data = {'keyword': kt,
'year': val['year'],
'month': val['month'],
'count': val['count']}
data = pd.DataFrame(data, index = [i])
stats_pd = stats_pd.append(data, ignore_index=True)
print(stats_pd)
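As a side note (my own variant, not part of the original answer): repeatedly calling DataFrame.append is slow and the method was removed in pandas 2.0; collecting plain dicts in a list and building the frame once is the usual pattern:

# Sketch: same parsing loop as above, but accumulate rows and build the frame once.
rows = []
if 'entries' in stats_page:
    for stats_result in stats_page['entries']:
        kt = None
        for stats_attribute in stats_result['data']:
            if stats_attribute['key'] == 'KEYWORD_TEXT':
                kt = stats_attribute['value']['value']
            else:
                for val in stats_attribute['value'][1]:
                    rows.append({'keyword': kt,
                                 'year': val['year'],
                                 'month': val['month'],
                                 'count': val['count']})
stats_pd = pd.DataFrame(rows)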

Nested Counter for JSON data

I have a JSON data as:
{
    "persons": [
        {
            "city": "Seattle",
            "name": "Brian",
            "dob": "19-03-1980"
        },
        {
            "city": "Amsterdam",
            "name": "David",
            "dob": "19-09-1979"
        },
        {
            "city": "London",
            "name": "Joe",
            "dob": "19-01-1980"
        },
        {
            "city": "Kathmandu",
            "name": "Brian",
            "dob": "19-03-1980"
        }
    ]
}
How can I count the individual elements in one single iteration using Python: the number of persons born in each month (Jan-Dec, 0 if none were born) of a given year, and also the number of unique names registered in each month?
Like:
1980 :3
--Jan:1
--Mar:2
1979 :1
--Sep:1
Names:
Mar 1980: 1 #Brian is same for both cities
Jan 1980: 1
Sep 1979: 1
counters_mon is the counter that holds the values for specific months of the year:
for k_mon, v_mon in counters_mon.items():
    print('{}={}'.format(k_mon, v_mon))
But I want the details to be printed too. How can I achieve this?
import json

with open('/path/to/your/json', 'r') as f:
    persons = json.load(f)

years_months = {}
years_months_names = {}

for person in persons['persons']:
    # dob is formatted DD-MM-YYYY
    year = person['dob'][-4:]
    month = person['dob'][3:5]
    month_year = month + ' ' + year
    name = person['name']
    if year not in years_months:
        years_months[year] = {'count': 1, 'months': {}}
        if month not in years_months[year]['months']:
            years_months[year]['months'][month] = 1
        else:
            years_months[year]['months'][month] += 1
    else:
        years_months[year]['count'] += 1
        if month not in years_months[year]['months']:
            years_months[year]['months'][month] = 1
        else:
            years_months[year]['months'][month] += 1
    if month_year not in years_months_names:
        years_months_names[month_year] = set([name])
    else:
        years_months_names[month_year].add(name)

for k, v in years_months.items():
    print(k + ': ' + str(v['count']))
    for month, count in v['months'].items():
        print("-- " + str(month) + ": " + str(count))

for k, v in years_months_names.items():
    print(k + ": " + str(len(v)))
I'm assuming that you have the path to your JSON file. I also tested my answer on the JSON that you've posted; be careful to make sure that your JSON is structured correctly (as posted, it was missing commas).
This is a good case for using defaultdicts (https://docs.python.org/3/library/collections.html#collections.defaultdict).
from collections import defaultdict
from calendar import month_abbr

data  # assume you have your data in a var called data

# slightly strange construction here, but we want 2 levels of defaultdict
# followed by lists
aggregate = defaultdict(lambda: defaultdict(list))

# then the population is super simple - you'll end up with something like
# aggregate[year][month] = [name1, name2]
for person in data['persons']:
    day, month, year = map(int, person['dob'].split('-'))
    aggregate[year][month].append(person['name'])

# I'm sorting in chronological order for printing
for year, months in sorted(aggregate.items()):
    print('{}: {}'.format(year, sum(len(names) for names in months.values())))
    for month, names in sorted(months.items()):
        print('--{}: {}'.format(month_abbr[month], len(names)))

for year, months in sorted(aggregate.items()):
    for month, names in sorted(months.items()):
        print('{} {}: {}'.format(month_abbr[month], year, len(set(names))))
Depending on how the data was going to be used I'd actually consider not having the complex nesting in the aggregation and instead opt for something like aggregate[(year, month)] = [name1, name2,...]. I find that the more nested my data, the more confusing it is to work with.
EDIT Alternatively you can create several structures on the first pass so the printing step is simplified. Again, I'm using defaultdict to clean up all the provisioning.
agg_years = defaultdict(lambda: defaultdict(int))  # [year][month] = counter
agg_years_total = defaultdict(int)                 # [year] = counter
agg_months_names = defaultdict(set)                # [(year, month)] = set(name1, name2...)

for person in data['persons']:
    day, month, year = map(int, person['dob'].split('-'))
    agg_years[year][month] += 1
    agg_years_total[year] += 1
    agg_months_names[(year, month)].add(person['name'])

for year, months in sorted(agg_years.items()):
    print('{}: {}'.format(year, agg_years_total[year]))
    for month, quant in sorted(months.items()):
        print('--{}: {}'.format(month_abbr[month], quant))

for (year, month), names in sorted(agg_months_names.items()):
    print('{} {}: {}'.format(month_abbr[month], year, len(names)))
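For reference (worked out by hand from the sample JSON above, so treat it as a sketch), the EDIT version should print:

1979: 1
--Sep: 1
1980: 3
--Jan: 1
--Mar: 2
Sep 1979: 1
Jan 1980: 1
Mar 1980: 1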

Python-mock: mocking pymongo's Cursor and Collection in the same test case

I want to write a mock test for some mongo queries, but I ran into a problem when I tried to mock two different mongo objects (Cursor and Collection) in the same scope.
I can't put the original code here, but I made a generic example with the same problem.
This is the function I want to test (using find and distinct):
def get_some_info(date1, date2):
    data = collection.find({"timestamp": {"$lt": date1, "$gt": date2}})
    id_list = data.distinct("id")
    for id in id_list:
        collection.find({"id": id})
This is the test code:
@mock.patch.object(Collection, "find")
@mock.patch.object(Cursor, "distinct")
def test_get_some_info(self, mock_distinct, mock_find):
    date1 = datetime.datetime.now()
    date2 = datetime.datetime.now()
    mock_distinct.return_value = ["id1", "id2"]
    find_parameters1 = {"timestamp": {"$lt": date1, "$gt": date2}}
    find_parameters2 = {"id": "id1"}
    self.my_class.get_some_info(date1, date2)
    mock_find.assert_called_with(find_parameters1)
    mock_find.assert_any_call(find_parameters2)
When I run the test, I get this error message:
'%s call not found' % expected_string
AssertionError: find({"id" : "id1"}) call not found
So I printed the value of id_list:
def get_some_info(date1, date2):
    data = collection.find({"timestamp": {"$lt": date1, "$gt": date2}})
    id_list = data.distinct("id")
    print(id_list)
    for id in id_list:
        collection.find({"id": id})
This is the value of id_list:
<MagicMock name='find().distinct()' id='139719585597776'>
But the expected value is: ["id1", "id2"]
When I comment out the Cursor's mock, the value of id_list is the same as before:
# @mock.patch.object(collection, "distinct")
@mock.patch.object(Cursor, "find")
def test_get_some_info(self, mock_find):
    date1 = datetime.datetime.now()
    date2 = datetime.datetime.now()
    mock_distinct.return_value = ["id1", "id2"]
    find_parameters1 = {"timestamp": {"$lt": date1, "$gt": date2}}
    find_parameters2 = {"id": "id1"}
    self.my_class.get_some_info(date1, date2)
    mock_find.assert_called_with(find_parameters1)
    mock_find.assert_any_call(find_parameters2)

<MagicMock name='find().distinct()' id='140089689306448'>
I think the code is always catching the "distinct" value from Collection, not from Cursor.
Has anybody had the same problem?
Mock objects in the test's arguments should be in reverse order relative to the decorators (take a look at Nesting Patch Decorators for details).
What you called mock_collection should be mock_find.
Your test code should be this:
@mock.patch.object(Cursor, "distinct")
@mock.patch.object(Collection, "find")
def test_get_some_info(self, mock_find, mock_distinct):
    date1 = datetime.datetime.now()
    date2 = datetime.datetime.now()
    mock_distinct.return_value = ["id1", "id2"]
    find_parameters1 = {"timestamp": {"$lt": date1, "$gt": date2}}
    find_parameters2 = {"id": "id1"}
    self.my_class.get_some_info(date1, date2)
    mock_find.assert_called_with(find_parameters1)
    mock_find.assert_any_call(find_parameters2)
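The general rule of thumb (standard unittest.mock behavior, summarized here rather than taken from the original answer): decorators apply bottom-up, so the patch written closest to the function supplies the first mock argument. A minimal sketch, assuming the pymongo classes used above:

from unittest import mock

from pymongo.collection import Collection
from pymongo.cursor import Cursor

@mock.patch.object(Cursor, "distinct")    # applied second -> second mock argument
@mock.patch.object(Collection, "find")    # applied first (innermost) -> first mock argument
def test_example(self, mock_find, mock_distinct):
    ...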
