Visualising data counts by day overlaid with key events - python

I'm working with Twitter data related to two different keywords. I want to count the number of tweets per day for each keyword, visualise the results on a line graph, then overlay that with the dates of a range of significant calendar events.
My aim is to see whether tweet counts change around specific events. I've already counted and visualised the tweets, but am having problems figuring out how to overlay key dates.
I tried putting the important dates into a list, but it threw an error. Could anyone give me some pointers or suggest a better way to approach this?
Here's an image that gives a rough idea of what I'm trying to achieve:
https://imgur.com/a/36esk1B
# Key event dates to flag in the daily tweet-count frame.
dates_list = ['2016-06-16','2016-06-23', '2016-06-24',
'2016-07-02', '2016-07-13']
#then convert list into a Series
key_dates = pd.Series(pd.to_datetime(dates_list))
# add columns to identify important events, and mark a 0 or 1.
tweet_trend['Important Events'] = False
# NOTE(review): .loc with a list-like of labels raises KeyError when any label is
# absent from the index — exactly the error shown in the traceback below.
tweet_trend.loc[key_dates, 'Important Events'] = True
tweet_trend['values'] = 0
tweet_trend.loc[key_dates, 'values'] = 1
KeyError Traceback (most recent call last)
<ipython-input-88-04dd081adc28> in <module>
10 # add columns to identify important events, and mark a 0 or 1.
11 tweet_trend['Important Events'] = False
---> 12 tweet_trend.loc[key_dates, 'Important Events'] = True
13 tweet_trend['values'] = 0
14 tweet_trend.loc[key_dates, 'values'] = 1
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
187 else:
188 key = com.apply_if_callable(key, self.obj)
--> 189 indexer = self._get_setitem_indexer(key)
190 self._setitem_with_indexer(indexer, value)
191
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _get_setitem_indexer(self, key)
165 if isinstance(key, tuple):
166 try:
--> 167 return self._convert_tuple(key, is_setter=True)
168 except IndexingError:
169 pass
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_tuple(self, key, is_setter)
246 if i >= self.obj.ndim:
247 raise IndexingError('Too many indexers')
--> 248 idx = self._convert_to_indexer(k, axis=i, is_setter=is_setter)
249 keyidx.append(idx)
250 return tuple(keyidx)
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing)
1352 kwargs = {'raise_missing': True if is_setter else
1353 raise_missing}
-> 1354 return self._get_listlike_indexer(obj, axis, **kwargs)[1]
1355 else:
1356 try:
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1159 self._validate_read_indexer(keyarr, indexer,
1160 o._get_axis_number(axis),
-> 1161 raise_missing=raise_missing)
1162 return keyarr, indexer
1163
~/venv/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1250 if not(self.name == 'loc' and not raise_missing):
1251 not_found = list(set(key) - set(ax))
-> 1252 raise KeyError("{} not in index".format(not_found))
1253
1254 # we skip the warning on Categorical/Interval
KeyError: "[Timestamp('2016-06-16 00:00:00')] not in index"

You can use Index.isin to test membership and then cast the column to integer for the True/False to 1/0 mapping; converting the dates to a Series is also unnecessary:
# Dates of the key events to mark in the daily tweet-count frame.
dates_list = ['2016-06-16','2016-06-23', '2016-06-24',
'2016-07-02', '2016-07-13']
# to_datetime on a list returns a DatetimeIndex, which Index.isin accepts
# directly — no need to wrap it in a Series.
key_dates = pd.to_datetime(dates_list)
# Bug fix: the original line referenced `df.index`, but the frame being
# annotated is `tweet_trend`; `df` is not defined in this snippet.
tweet_trend['Important Events'] = tweet_trend.index.isin(key_dates)
# Cast the boolean flag column to int for the 0/1 marker column.
tweet_trend['values'] = tweet_trend['Important Events'].astype(int)

Related

How do I add a list to a column in pandas?

I'm trying to merge the columns kw1, kw2, kw3 shown here:
and have it in one separate column called keywords. This is what I tried:
df['keywords'] = list((df['kw1'], df['kw2'], df['kw3']))
df
but I'm getting this error:
ValueError Traceback (most recent call last)
Input In [13], in <cell line: 1>()
----> 1 df['keywords'] = list((df['kw1'], df['kw2'], df['kw3']))
2 df
File /lib/python3.10/site-packages/pandas/core/frame.py:3655, in DataFrame.__setitem__(self, key, value)
3652 self._setitem_array([key], value)
3653 else:
3654 # set column
-> 3655 self._set_item(key, value)
File /lib/python3.10/site-packages/pandas/core/frame.py:3832, in DataFrame._set_item(self, key, value)
3822 def _set_item(self, key, value) -> None:
3823 """
3824 Add series to DataFrame in specified column.
3825
(...)
3830 ensure homogeneity.
3831 """
-> 3832 value = self._sanitize_column(value)
3834 if (
3835 key in self.columns
3836 and value.ndim == 1
3837 and not is_extension_array_dtype(value)
3838 ):
3839 # broadcast across multiple columns if necessary
3840 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File /lib/python3.10/site-packages/pandas/core/frame.py:4535, in DataFrame._sanitize_column(self, value)
4532 return _reindex_for_setitem(value, self.index)
4534 if is_list_like(value):
-> 4535 com.require_length_match(value, self.index)
4536 return sanitize_array(value, self.index, copy=True, allow_2d=True)
File /lib/python3.10/site-packages/pandas/core/common.py:557, in require_length_match(data, index)
553 """
554 Check the length of data matches the length of the index.
555 """
556 if len(data) != len(index):
--> 557 raise ValueError(
558 "Length of values "
559 f"({len(data)}) "
560 "does not match length of index "
561 f"({len(index)})"
562 )
ValueError: Length of values (3) does not match length of index (141)
Is there a way to make it so that it turns it into a list like this [{value of kw1}, {value of kw2}, {value of kw3}]
You can do it like this
# Stack the three Series column-wise into shape (n_rows, 3), then convert each
# row into a 3-element list so the new column holds one list per row.
df['keywords'] = np.stack([df['kw1'], df['kw2'], df['kw3']], axis=1).tolist()
Pandas treats each element in the outermost list as a single value, so it complains that you only have three values (your three series) when it needs 141 values for a new column, since your original frame has 141 rows.
Stacking the underlying numpy arrays of the three series on the last dimension gives you a shape (141,3) and converting them to list gives you a list of length 141, with each element being another list of length 3.
A more concise way is to extract three columns as another df and let pandas do the stacking for you
# Selecting the three columns as a sub-frame and calling .values.tolist()
# yields one [kw1, kw2, kw3] list per row — pandas does the stacking for us.
df['keywords'] = df[['kw1', 'kw2', 'kw3']].values.tolist()

Unhashable list error when finding duplicates in a pandas dataframe

Hi, this is really confusing me, as I am using one command on a large dataframe:
# Fixed: the string literal 'first' was missing its closing quote, which would
# be a SyntaxError as written (the traceback shows the corrected call ran).
df.duplicated(subset=None, keep='first')
This looks identical to what the documentation says of:
DataFrame.duplicated(subset=None, keep='first')
I'm just using df instead, however, all I get back is the following traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-53-529f7b7a97fb> in <module>()
----> 1 df.duplicated(subset=None, keep='first')
/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in duplicated(self, subset, keep)
4383 vals = (col.values for name, col in self.iteritems()
4384 if name in subset)
-> 4385 labels, shape = map(list, zip(*map(f, vals)))
4386
4387 ids = get_group_index(labels, shape, sort=False, xnull=False)
/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in f(vals)
4364 def f(vals):
4365 labels, shape = algorithms.factorize(
-> 4366 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
4367 return labels.astype('i8', copy=False), len(shape)
4368
/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
176 else:
177 kwargs[new_arg_name] = new_arg_value
--> 178 return func(*args, **kwargs)
179 return wrapper
180 return _deprecate_kwarg
/anaconda3/lib/python3.7/site-packages/pandas/core/algorithms.py in factorize(values, sort, order, na_sentinel, size_hint)
628 na_sentinel=na_sentinel,
629 size_hint=size_hint,
--> 630 na_value=na_value)
631
632 if sort and len(uniques) > 0:
/anaconda3/lib/python3.7/site-packages/pandas/core/algorithms.py in _factorize_array(values, na_sentinel, size_hint, na_value)
474 uniques = vec_klass()
475 labels = table.get_labels(values, uniques, 0, na_sentinel,
--> 476 na_value=na_value)
477
478 labels = _ensure_platform_int(labels)
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_labels()
TypeError: unhashable type: 'list'
What am I doing wrong?
From what I can understand, you got lists in your data frame and python or Pandas can not hash lists. You may have observed this, in case you ever tried to use lists as keys in a dictionary. A simple workaround would be to convert the lists to tuples which are hashable.

How to plot a pandas.core.series.Series as a bar graph?

I'm trying to plot a pandas series variable, which has a numeric id in one column and frequency of that id in the next column. I wish to plot these two as a bar graph with freq on the y-axis and id no. on the x-axis. However, there are too many rows, i.e. id numbers. Is there a way I can only plot the top 10 most frequently occurring ids?
executing this code - area_count.plot.bar
gives this error-
<bound method SeriesPlotMethods.bar of
<pandas.plotting._core.SeriesPlotMethods object at 0x0000019C68029908>>
I tried storing the top 20 values from this series into another variable using the following code:
# NOTE(review): tuple-indexing a plain Series (area_count[i,:]) requires a
# MultiIndex — this is what raises the "Can only tuple-index with a MultiIndex"
# error in the traceback below. (Loop-body indentation was lost in the paste.)
for i in range(1,20):
f[i,:] = area_count[i,:]
But it showed this error:
ValueError Traceback (most recent call last)
<ipython-input-88-1020cb7bdfc3> in <module>
1 for i in range(1,20):
----> 2 f[i,:] = area_count[i,:]
~\Anaconda3\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
909 key = check_bool_indexer(self.index, key)
910
--> 911 return self._get_with(key)
912
913 def _get_with(self, key):
~\Anaconda3\lib\site-packages\pandas\core\series.py in _get_with(self, key)
921 elif isinstance(key, tuple):
922 try:
--> 923 return self._get_values_tuple(key)
924 except Exception:
925 if len(key) == 1:
~\Anaconda3\lib\site-packages\pandas\core\series.py in _get_values_tuple(self, key)
966
967 if not isinstance(self.index, MultiIndex):
--> 968 raise ValueError('Can only tuple-index with a MultiIndex')
969
970 # If key is contained, would have returned by now
ValueError: Can only tuple-index with a MultiIndex
If I understand you correctly, you now need the top 10 frequently occurring ids, do it by turning your series into a dataframe like:
# value_counts() already sorts descending, so sort_values here is redundant but
# harmless; reset_index() moves the id values out of the index into a column
# that pandas names 'index'.
x = df['id'].value_counts().sort_values(ascending = False).head(10).to_frame().reset_index()
# rename maps old -> new names simultaneously: the 'index' column becomes 'id'
# and the original counts column (named 'id') becomes 'freq'.
x.rename(columns = {'index':'id', 'id': 'freq'}, inplace = True)
Now plot the graph:
x.plot.bar(x = 'id', y = 'freq')
Sample Output:

Column selection in Python

I am trying to find solution to below given problem but seems like I am going wrong with the approach
I have a set of Excel with some columns like ISBN, Title etc. The columns names in Excel are not properly formatted. ISBN is named as ISBN in some of the Excel files while it is named as ISBN-13, Alias, ISBN13 etc. in others. Similarly for Title and other columns.
I have read all these Excels as data frame in python using read Excel and used str.contains to find the columns based on substring. Please find code below:
searchfor = ['ISBN13','BAR CODE','ISBN NO#','ISBN','ISBN1','ISBN
13','ISBN_13','ITEM','ISBN NUMBER','ISBN No','ISBN-13','ISBN (13
DIGITS)','EAN','ALIAS','ITEMCODE']
searchfor1 = ['TITLE','BOOK NAME','NAME','TITLE
NAME','TITLES','BOOKNAME','BKDESC','PRODUCT NAME','ITEM DESCRIPTION','TITLE
18','COMPLETETITLE']
for f, i in zip(files_txt1, num1):
df = pd.read_excel(f,encoding='sys.getfilesystemencoding()')
df.columns = df.columns.str.upper()
df1['Isbn'] = df[df.columns[df.columns.str.contains('|'.join(searchfor))]]
df1['Title']=
df[df.columns[df.columns.to_series().str.contains('|'.join(searchfor1))]]
The code works fine when the Excel file contains a column whose name matches an entry in the list, but it throws an error when no such column exists. The code also does not work for ISBN.
Please see detailed error below:
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\common.py in _asarray_tuplesafe(values, dtype)
376 result = np.empty(len(values), dtype=object)
--> 377 result[:] = values
378 except ValueError:
ValueError: could not broadcast input array from shape (31807,0) into shape (31807)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last) C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value) 2375 try:
-> 2376 value = Series(value) 2377 except:
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
247 data = _sanitize_array(data, index, dtype, copy,
--> 248 raise_cast_failure=True)
249
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure) 3028 else:
-> 3029 subarr = _asarray_tuplesafe(data, dtype=dtype) 3030
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\common.py in _asarray_tuplesafe(values, dtype)
379 # we have a list-of-list
--> 380 result[:] = [tuple(x) for x in values]
381
ValueError: cannot copy sequence with size 0 to array axis with dimension 31807
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last) <ipython-input-23-9e043c13fef2> in <module>()
11 df.columns = df.columns.str.upper()
12 #print(list(df.columns))
---> 13 df1['Isbn'] = df[df.columns[df.columns.str.contains('|'.join(searchfor))]]
14 df1['Title'] = df[df.columns[df.columns.to_series().str.contains('|'.join(searchfor1))]]
15 df1['Curr'] = df[df.columns[df.columns.to_series().str.contains('|'.join(searchfor2))]]
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value) 2329 else: 2330
# set column
-> 2331 self._set_item(key, value) 2332 2333 def _setitem_slice(self, key, value):
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in _set_item(self, key, value) 2394 """ 2395
-> 2396 self._ensure_valid_index(value) 2397 value = self._sanitize_column(key, value) 2398 NDFrame._set_item(self, key, value)
C:\Users\Ruchir_Kumar_Jha\AppData\Local\Enthought\Canopy\edm\envs\User\lib\site-packages\pandas\core\frame.py in _ensure_valid_index(self, value) 2376 value = Series(value) 2377 except:
-> 2378 raise ValueError('Cannot set a frame with no defined index ' 2379 'and a value that cannot be converted to a ' 2380 'Series')
ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series
You don't need all of this. If you know your columns beforehand, just specify them when reading the file into pandas — that way you will also reduce memory usage significantly.
# NOTE(review): with a usecols *list*, read_csv raises ValueError when any
# listed column is absent from the file, so this only works when every file
# contains one of these headers — and the question's files are Excel, so
# pd.read_excel would be needed here; confirm before relying on this one-liner.
df = pd.read_csv(file_name, usecols=['ISBN13','BAR CODE','ISBN NO#','ISBN','ISBN1','ISBN 13','ISBN_13','ITEM','ISBN NUMBER','ISBN No','ISBN-13','ISBN (13 DIGITS)','EAN','ALIAS','ITEMCODE']).fillna('')
This would work as long as you have no match or exactly 1 match
# Candidate header spellings for the ISBN column across the input files.
searchfor = ['ISBN13','BAR CODE','ISBN NO#','ISBN','ISBN1','ISBN 13','ISBN_13','ITEM','ISBN NUMBER','ISBN No','ISBN-13','ISBN (13 DIGITS)','EAN','ALIAS','ITEMCODE']
# Candidate header spellings for the Title column.
searchfor1 = ['TITLE','BOOK NAME','NAME','TITLE NAME','TITLES','BOOKNAME','BKDESC','PRODUCT NAME','ITEM DESCRIPTION','TITLE 18','COMPLETETITLE']
# Loop-body indentation was lost in the paste; lines below belong inside the loop.
for f, i in zip(files_txt1, num1):
# NOTE(review): encoding='sys.getfilesystemencoding()' passes that literal
# string, not the result of calling the function — presumably unintended.
df = pd.read_excel(f,encoding='sys.getfilesystemencoding()')
df.columns = df.columns.str.upper()
cols = df.columns
# Exact-name membership test; works for no match or exactly one match,
# as the answer states — multiple matches would assign a multi-column frame.
is_isbn = cols.isin(searchfor)
df1['Isbn'] = df[cols[is_isbn]] if is_isbn.any() else None
is_title = cols.isin(searchfor1)
df1['Title'] = df[cols[is_title]] if is_title.any() else None

Error: all arrays must be same length. But they ARE the same length

I am doing some work about sentiment analysis, here I have three arrays:the content of the sentences, the sentiment score and the key words.
I want to display them as a dataframe by pandas, but I got :
"ValueError: arrays must all be same length"
Here are some of my codes:
print(len(text_sentences),len(score_list),len(keyinfo_list))
df = pd.DataFrame(text_sentences,score_list,keyinfo_list)
print(df)
Here are the results:
182 182 182
ValueError Traceback (most recent call last)
<ipython-input-15-cfb70aca07d1> in <module>()
21 print(len(text_sentences),len(score_list),len(keyinfo_list))
22
---> 23 df = pd.DataFrame(text_sentences,score_list,keyinfo_list)
24
25 print(df)
E:\learningsoft\anadonda\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
328 else:
329 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 330 copy=copy)
331 else:
332 mgr = self._init_dict({}, index, columns, dtype=dtype)
E:\learningsoft\anadonda\lib\site-packages\pandas\core\frame.py in _init_ndarray(self, values, index, columns, dtype, copy)
472 raise_with_traceback(e)
473
--> 474 index, columns = _get_axes(*values.shape)
475 values = values.T
476
E:\learningsoft\anadonda\lib\site-packages\pandas\core\frame.py in _get_axes(N, K, index, columns)
439 columns = _default_index(K)
440 else:
--> 441 columns = _ensure_index(columns)
442 return index, columns
443
E:\learningsoft\anadonda\lib\site-packages\pandas\core\indexes\base.py in _ensure_index(index_like, copy)
4015 if len(converted) > 0 and all_arrays:
4016 from .multi import MultiIndex
-> 4017 return MultiIndex.from_arrays(converted)
4018 else:
4019 index_like = converted
E:\learningsoft\anadonda\lib\site-packages\pandas\core\indexes\multi.py in from_arrays(cls, arrays, sortorder, names)
1094 for i in range(1, len(arrays)):
1095 if len(arrays[i]) != len(arrays[i - 1]):
-> 1096 raise ValueError('all arrays must be same length')
1097
1098 from pandas.core.categorical import _factorize_from_iterables
ValueError: all arrays must be same length
You can see all my three arrays contain 182 elements, so I don't understand why it said "all arrays must be same length".
You're passing the wrong data into pandas.DataFrame's initializer.
The way you're using it, you're essentially running:
pandas.DataFrame(data=text_sentences, index=score_list, columns=keyinfo_list)
This isn't what you want. You probably want to do something like this instead:
# Passing a dict makes each array a named column; all three must share length,
# which they do (182 each), so no "arrays must all be same length" error.
pd.DataFrame(data={
'sentences': text_sentences,
'scores': score_list,
'keyinfo': keyinfo_list
})

Categories

Resources