I am using Python 3.10 and here is my Locust file.
from locust import HttpUser, task, between
import string
import random
import time
import datetime

WAIT_TIME_MIN = 1
WAIT_TIME_MAX = 5

h = {
    "Content-Type": "application/json"
}

random.seed()

class LoadTest(HttpUser):
    wait_time = between(WAIT_TIME_MIN, WAIT_TIME_MAX)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path = None

    def generate_random_string(self, min_name_size=2, max_name_size=20) -> str:
        letters = string.ascii_lowercase
        string_size = random.randint(min_name_size, max_name_size)
        generated_string = ''.join(random.choice(letters) for i in range(string_size))
        return generated_string

    def generate_random_dob(self) -> str:
        d = random.randint(1, int(time.time()))
        return datetime.date.fromtimestamp(d).strftime('%Y-%m-%d')

    @task(2)
    def get_all(self):
        self.client.get(url=self.path)

    @task(8)
    def post_request(self):
        self.client.post(url=self.path, json=self._generate_post_data(), headers=h)

class TeacherProcess(LoadTest):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path = "/api/v1/teacher"

    def _generate_post_data(self):
        request_data = {
            "teacherName": str,
            "teacherEmail": str,
            "teacherDOB": str
        }
        request_data["teacherName"] = self.generate_random_string()
        request_data["teacherEmail"] = f"{self.generate_random_string()}@{self.generate_random_string()}.{self.generate_random_string()}"
        request_data["teacherDOB"] = self.generate_random_dob()
        return request_data

class StudentProcess(LoadTest):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path = "/api/v1/student"

    def _generate_post_data(self):
        request_data = {
            "studentName": str,
            "studentEmail": str,
            "studentDOB": str
        }
        request_data["studentName"] = self.generate_random_string()
        request_data["studentEmail"] = f"{self.generate_random_string()}@{self.generate_random_string()}.{self.generate_random_string()}"
        request_data["studentDOB"] = self.generate_random_dob()
        return request_data
I don't know how, but somehow I am able to use _generate_post_data inside the LoadTest class. I can tell it's working because the Locust output is as below:
Type Name # reqs # fails | Avg Min Max Med | req/s failures/s
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
GET /api/v1/student 126 0(0.00%) | 36 2 117 7 | 12.78 0.00
POST /api/v1/student 499 0(0.00%) | 66 1 276 6 | 50.61 0.00
GET /api/v1/teacher 135 0(0.00%) | 53 2 233 8 | 13.69 0.00
POST /api/v1/teacher 502 0(0.00%) | 60 2 238 6 | 50.92 0.00
--------|----------------------------------------------------------------------------|-------|-------------|-------|-------|-------|-------|--------|-----------
Aggregated 1262 0(0.00%) | 59 1 276 7 | 128.00 0.00
Response time percentiles (approximated)
Type Name 50% 66% 75% 80% 90% 95% 98% 99% 99.9% 99.99% 100% # reqs
--------|--------------------------------------------------------------------------------|--------|------|------|------|------|------|------|------|------|------|------|------
GET /api/v1/student 7 10 110 110 110 110 120 120 120 120 120 126
POST /api/v1/student 6 10 200 220 250 260 260 270 280 280 280 499
GET /api/v1/teacher 8 10 110 140 230 230 230 230 230 230 230 135
POST /api/v1/teacher 6 9 180 200 220 230 230 230 240 240 240 502
--------|--------------------------------------------------------------------------------|--------|------|------|------|------|------|------|------|------|------|------|------
Aggregated 7 10 110 190 230 240 260 260 270 280 280 1262
As you can see there are no failures. My first question is: how am I able to access _generate_post_data while I am inside the LoadTest class? The second one is related to the error below:
[2023-01-25 12:31:38,147] pop-os/ERROR/locust.user.task: 'LoadTest' object has no attribute '_generate_post_data'
Traceback (most recent call last):
File "/home/ak/.local/lib/python3.10/site-packages/locust/user/task.py", line 347, in run
self.execute_next_task()
File "/home/ak/.local/lib/python3.10/site-packages/locust/user/task.py", line 372, in execute_next_task
self.execute_task(self._task_queue.pop(0))
File "/home/ak/.local/lib/python3.10/site-packages/locust/user/task.py", line 493, in execute_task
task(self.user)
File "/home/ak/Desktop/my-projects/spring-boot-app/performans-testing/locust.py", line 38, in post_request
self.client.post(url=self.path, json=self._generate_post_data(), headers=h)
AttributeError: 'LoadTest' object has no attribute '_generate_post_data'
I am quite confused: is that a Locust bug, or am I doing something wrong? Locust doesn't show any failure, yet I am getting this error. If anyone can explain that, I would appreciate it.
Locust spawns users from every non-abstract HttpUser subclass it finds in the file, so besides TeacherProcess and StudentProcess it also instantiates the parent LoadTest itself. Those LoadTest instances have no _generate_post_data, which is what raises the AttributeError in your log; the subclass instances work fine, which is why the stats still show successful requests. (The stats table only counts HTTP request failures; exceptions raised inside tasks are logged and reported separately, so they never show up in the failures column.) You need to inform Locust not to instantiate users based on the parent LoadTest class:
class LoadTest(HttpUser):
    abstract = True
    ...
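For reference, a minimal sketch of how the base class ends up (everything else in the file stays exactly as in the question):
class LoadTest(HttpUser):
    abstract = True  # Locust will not spawn this class directly
    wait_time = between(WAIT_TIME_MIN, WAIT_TIME_MAX)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.path = None

    @task(2)
    def get_all(self):
        self.client.get(url=self.path)

    @task(8)
    def post_request(self):
        # _generate_post_data is resolved on the concrete subclasses
        # (TeacherProcess, StudentProcess), which only they define.
        self.client.post(url=self.path, json=self._generate_post_data(), headers=h)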
I am working with an external data source and I am trying to get a QuarterStart (QS) frequency for a particular data field. I am providing dummy data and code below.
import pandas as pd
df = pd.DataFrame(data=[['2022-01-01', '2021-01-03', 'a'], ['2020-05-01', '2021-03-03', 'b'],
['2023-06-02', '2019-04-03', 'c']], columns=['open_dt', 'd2', 'x'])
df['open_dt'] = df['open_dt'].astype('datetime64[ns]')
df['quater_open_dt'] = df['open_dt'].dt.to_period('QS')
I am getting the following error when I run this:
---> 7 df['quater_open_dt'] = df['open_dt'].dt.to_period('QS')
8 df['establish_date'] = df['establish_date'].astype('datetime64[ns]')
9 df_tin = grp_on_tin(df)
/opt/miniconda3/envs/PY37/lib/python3.7/site-packages/pandas/core/accessor.py in f(self, *args, **kwargs)
90 def _create_delegator_method(name):
91 def f(self, *args, **kwargs):
---> 92 return self._delegate_method(name, *args, **kwargs)
93
94 f.__name__ = name
/opt/miniconda3/envs/PY37/lib/python3.7/site-packages/pandas/core/indexes/accessors.py in _delegate_method(self, name, *args, **kwargs)
107
108 method = getattr(values, name)
--> 109 result = method(*args, **kwargs)
110
111 if not is_list_like(result):
/opt/miniconda3/envs/PY37/lib/python3.7/site-packages/pandas/core/indexes/extension.py in method(self, *args, **kwargs)
81
82 def method(self, *args, **kwargs):
---> 83 result = attr(self._data, *args, **kwargs)
84 if wrap:
85 if isinstance(result, type(self._data)):
/opt/miniconda3/envs/PY37/lib/python3.7/site-packages/pandas/core/arrays/datetimes.py in to_period(self, freq)
1121 freq = res
1122
-> 1123 return PeriodArray._from_datetime64(self._data, freq, tz=self.tz)
1124
1125 def to_perioddelta(self, freq):
/opt/miniconda3/envs/PY37/lib/python3.7/site-packages/pandas/core/arrays/period.py in _from_datetime64(cls, data, freq, tz)
236 PeriodArray[freq]
237 """
--> 238 data, freq = dt64arr_to_periodarr(data, freq, tz)
239 return cls(data, freq=freq)
240
/opt/miniconda3/envs/PY37/lib/python3.7/site-packages/pandas/core/arrays/period.py in dt64arr_to_periodarr(data, freq, tz)
975 data = data._values
976
--> 977 base = freq._period_dtype_code
978 return c_dt64arr_to_periodarr(data.view("i8"), base, tz), freq
979
AttributeError: 'pandas._libs.tslibs.offsets.QuarterBegin' object has no attribute '_period_dtype_code'
Can someone please help me understand what's happening here?
PS: The data given here is dummy data, not the original data.
The 'QS' frequency is the problem: it is an offset alias (used by functions like resample and date_range), while to_period expects a period frequency such as 'Q', and periods have no start/end anchoring. Convert to quarterly periods and take each period's start time instead:
df["quater_open_dt"] = df["open_dt"].dt.to_period("Q").dt.start_time
print(df)
# Output
open_dt d2 x quater_open_dt
0 2022-01-01 2021-01-03 a 2022-01-01
1 2020-05-01 2021-03-03 b 2020-04-01
2 2023-06-02 2019-04-03 c 2023-04-01
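On a reasonably recent pandas, an equivalent spelling should be (to_timestamp defaults to the start of each period):
# Same result: quarterly periods cast back to their start timestamps.
df["quater_open_dt"] = df["open_dt"].dt.to_period("Q").dt.to_timestamp()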
So I have a folder called "data", say, containing many CSV files.
import dask.dataframe as dd
df = dd.read_csv('data/*.csv')
df.head()
df.column_1.mean().compute()
The above lines of code work perfectly and the dask.compute method does its job. But when I add the include_path_column=True parameter to the dd.read_csv() function call, sketched below, I get the error that follows.
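A minimal sketch of the failing variant (assuming the extra column keeps Dask's default name, "path"):
import dask.dataframe as dd

# Same read as above, but with an extra column recording each row's
# source file (Dask names it "path" by default).
df = dd.read_csv('data/*.csv', include_path_column=True)
df['H_hp'].mean().compute()  # raises the IndexError shown below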
IndexError: list index out of range
When I expand the error I get:
IndexError Traceback (most recent call last)
<ipython-input-129-4be67235bebb> in <module>
----> 1 df['H_hp'].mean().compute()
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
165 dask.base.compute
166 """
--> 167 (result,) = compute(self, traverse=False, **kwargs)
168 return result
169
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
444 )
445
--> 446 dsk = collections_to_dsk(collections, optimize_graph, **kwargs)
447 keys, postcomputes = [], []
448 for x in collections:
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/base.py in collections_to_dsk(collections, optimize_graph, **kwargs)
216 dsk, keys = _extract_graph_and_keys(val)
217 groups[opt] = (dsk, keys)
--> 218 _opt = opt(dsk, keys, **kwargs)
219 _opt_list.append(_opt)
220
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/dataframe/optimize.py in optimize(dsk, keys, **kwargs)
19 dsk = fuse_roots(dsk, keys=flat_keys)
20
---> 21 dsk = ensure_dict(dsk)
22
23 if isinstance(keys, list):
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/utils.py in ensure_dict(d)
1030 dd_id = id(dd)
1031 if dd_id not in seen:
-> 1032 result.update(dd)
1033 seen.add(dd_id)
1034 return result
/anaconda/envs/azureml_py36/lib/python3.6/site-packages/dask/dataframe/io/csv.py in __getitem__(self, key)
80
81 if self.paths is not None:
---> 82 path_info = (self.colname, self.paths[i], self.paths)
83 else:
84 path_info = None
IndexError: list index out of range
I'm trying to profile an Excel file. It is a very small data set, only 30 columns and 535 rows, but when I run the profile_report function it stops each time at a different percentage, always with the same message:
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-41-283dd2cb2000> in <module>
1 df=pd.read_excel(path_working+'Documents/Information/'+'sample.xlsx')
2 profile = df.profile_report(title='Sample Exploratory')
----> 3 profile.to_file(path_working+'sample.html')
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_file(self, output_file, silent)
276 create_html_assets(output_file)
277
--> 278 data = self.to_html()
279
280 if output_file.suffix != ".html":
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in to_html(self)
384
385 """
--> 386 return self.html
387
388 def to_json(self) -> str:
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in html(self)
199 def html(self):
200 if self._html is None:
--> 201 self._html = self._render_html()
202 return self._html
203
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in _render_html(self)
306 from pandas_profiling.report.presentation.flavours import HTMLReport
307
--> 308 report = self.report
309
310 disable_progress_bar = not config["progress_bar"].get(bool)
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in report(self)
193 def report(self):
194 if self._report is None:
--> 195 self._report = get_report_structure(self.description_set)
196 return self._report
197
~\anaconda3\lib\site-packages\pandas_profiling\profile_report.py in description_set(self)
172 def description_set(self):
173 if self._description_set is None:
--> 174 self._description_set = describe_df(
175 self.title, self.df, self.summarizer, self.typeset, self._sample
176 )
~\anaconda3\lib\site-packages\pandas_profiling\model\describe.py in describe(title, df, summarizer, typeset, sample)
72 total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
73 ) as pbar:
---> 74 series_description = get_series_descriptions(df, summarizer, typeset, pbar)
75
76 pbar.set_postfix_str("Get variable types")
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in get_series_descriptions(df, summarizer, typeset, pbar)
97 # TODO: use `Pool` for Linux-based systems
98 with multiprocessing.pool.ThreadPool(pool_size) as executor:
---> 99 for i, (column, description) in enumerate(
100 executor.imap_unordered(multiprocess_1d, args)
101 ):
~\anaconda3\lib\multiprocessing\pool.py in next(self, timeout)
866 if success:
867 return value
--> 868 raise value
869
870 __next__ = next # XXX
~\anaconda3\lib\multiprocessing\pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
123 job, i, func, args, kwds = task
124 try:
--> 125 result = (True, func(*args, **kwds))
126 except Exception as e:
127 if wrap_exception and func is not _helper_reraises_exception:
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in multiprocess_1d(args)
76 """
77 column, series = args
---> 78 return column, describe_1d(series, summarizer, typeset)
79
80 pool_size = config["pool_size"].get(int)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary.py in describe_1d(series, summarizer, typeset)
50 vtype = typeset.detect_type(series)
51
---> 52 return summarizer.summarize(series, dtype=vtype)
53
54
~\anaconda3\lib\site-packages\pandas_profiling\model\summarizer.py in summarize(self, series, dtype)
54 """
55 summarizer_func = compose(self.summary_map.get(dtype, []))
---> 56 _, summary = summarizer_func(series, {"type": dtype})
57 return summary
58
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
21 return f(*x)
22 else:
---> 23 return f(*res)
24
25 return func2
~\anaconda3\lib\site-packages\pandas_profiling\model\handler.py in func2(*x)
17 def func(f, g):
18 def func2(*x):
---> 19 res = g(*x)
20 if type(res) == bool:
21 return f(*x)
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in inner(series, summary)
70 if not summary["hashable"]:
71 return series, summary
---> 72 return fn(series, summary)
73
74 return inner
~\anaconda3\lib\site-packages\visions\utils\series_utils.py in inner(series, state, *args, **kwargs)
40 return False
41
---> 42 return fn(series, state, *args, **kwargs)
43
44 return inner
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_algorithms.py in describe_numeric_1d(series, summary)
208
209 if chi_squared_threshold > 0.0:
--> 210 stats["chi_squared"] = chi_square(finite_values)
211
212 stats["range"] = stats["max"] - stats["min"]
~\anaconda3\lib\site-packages\pandas_profiling\model\summary_helpers.py in chi_square(values, histogram)
352 def chi_square(values=None, histogram=None):
353 if histogram is None:
--> 354 histogram, _ = np.histogram(values, bins="auto")
355 return dict(chisquare(histogram)._asdict())
356
<__array_function__ internals> in histogram(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)
790 a, weights = _ravel_and_check_weights(a, weights)
791
--> 792 bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
793
794 # Histogram is an integer or a float array depending on the weights.
~\anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)
444
445 # bin edges must be computed
--> 446 bin_edges = np.linspace(
447 first_edge, last_edge, n_equal_bins + 1,
448 endpoint=True, dtype=bin_type)
<__array_function__ internals> in linspace(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\core\function_base.py in linspace(start, stop, num, endpoint, retstep, dtype, axis)
126
127 delta = stop - start
--> 128 y = _nx.arange(0, num, dtype=dt).reshape((-1,) + (1,) * ndim(delta))
129 # In-place multiplication y *= delta/div is faster, but prevents the multiplicant
130 # from overriding what class is produced, and thus prevents, e.g. use of Quantities,
MemoryError: Unable to allocate 1.75 EiB for an array with shape (251938683619878560,) and data type float64
I ran the same code in a different python installation and it ran fine.
Thank you all in advance and let me know if you need more information.
This is a bug in numpy.histogram (https://github.com/numpy/numpy/issues/10297), also reported on SO (Numpy histogram extremely slow on small data set).
This error is caused by the call to np.histogram(x, bins='auto'). When the input has very large values, the "auto" method can fail while trying to generate an enormous number of bins that cannot fit in RAM.
As a workaround, you can remove the large values manually before generating the report.
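A minimal sketch of that workaround (the file path, the cutoff, and the column screening are placeholders to adapt to your data):
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

df = pd.read_excel('sample.xlsx')  # placeholder path

# Drop rows containing numeric values so extreme that the bins='auto'
# heuristic in np.histogram would try to allocate a huge bin array.
cutoff = 1e12  # placeholder threshold, data-dependent
numeric_cols = df.select_dtypes(include=[np.number]).columns
mask = ~(df[numeric_cols].abs() >= cutoff).any(axis=1)  # keeps NaN rows

profile = ProfileReport(df[mask], title='Sample Exploratory')
profile.to_file('sample.html')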
What's going on with my dictionary comprehension here?
I am parsing a BLAST file and want to create objects for each line in the file. Ideally each object will be stored in a dictionary for use later in the program.
Parsing works fine but I end up with a blank transSwiss dictionary.
Here are a few lines of output as an example:
c0_g1_i1|m.1 gi|74665200|sp|Q9HGP0.1|PVG4_SCHPO 100.00 372 0 0 1 372 1 372 0.0 754
c1000_g1_i1|m.799 gi|48474761|sp|O94288.1|NOC3_SCHPO 100.00 747 0 0 5 751 1 747 0.0 1506
c1001_g1_i1|m.800 gi|259016383|sp|O42919.3|RT26A_SCHPO 100.00 268 0 0 1 268 1 268 0.0 557
c1002_g1_i1|m.801 gi|1723464|sp|Q10302.1|YD49_SCHPO 100.00 646 0 0 1 646 1 646 0.0 1310
I'm trying to make each BLAST line a parse_blast object.
class parse_blast(object):
    def __init__(self, line):
        # Strip end-of-line and split on tabs
        self.fields = line.strip("\n").split("\t")
        self.transcriptId, self.isoform = self.fields[0].split("|")
        self.swissStuff = self.fields[1].split("|")
        self.swissProtId = self.swissStuff[3]
        self.percentId = self.fields[2]

    def filterblast(self):
        return float(self.percentId) > 95

blastmap = map(parse_blast, blast_output.readlines())
filtered = filter(parse_blast.filterblast, blastmap)
transSwiss = {blastmap.transcriptId:blastmap for blastmap.transcriptId in filtered}
When you do this:
for blastmap.transcriptId in filtered
you're trying to assign each element of filtered to blastmap.transcriptId in sequence. blastmap is either a list or an instance of the map type, depending on your Python version, so it has no transcriptId attribute, and your code fails with an AttributeError.
Use a variable. A new variable:
transSwiss = {pb.transcriptId: pb for pb in filtered}
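For completeness, a minimal sketch of the whole pipeline with the fixed comprehension (the filename is a placeholder for whatever blast_output is opened from). Note that in Python 3, map and filter return lazy iterators, so filtered can only be consumed once:
with open("blast_results.tsv") as blast_output:  # placeholder filename
    blastmap = map(parse_blast, blast_output.readlines())
    filtered = filter(parse_blast.filterblast, blastmap)
    # Consume the iterator once, binding each object to a plain variable.
    transSwiss = {pb.transcriptId: pb for pb in filtered}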
I am trying to convert a COO-type sparse matrix (from scipy.sparse) to a Pandas sparse series. The documentation (http://pandas.pydata.org/pandas-docs/stable/sparse.html) says to use the command SparseSeries.from_coo(A). This seems to be OK, but when I try to see the series' attributes, this is what happens.
10x10 seems OK.
import pandas as pd
import scipy.sparse as ss
import numpy as np
row = (np.random.random(10)*10).astype(int)
col = (np.random.random(10)*10).astype(int)
val = np.random.random(10)*10
sparse = ss.coo_matrix((val,(row,col)),shape=(10,10))
pss = pd.SparseSeries.from_coo(sparse)
print pss
0 7 1.416631
9 5.833902
1 0 4.131919
2 3 2.820531
7 2.227009
3 1 9.205619
4 4 8.309077
6 0 4.376921
7 6 8.444013
7 7.383886
dtype: float64
BlockIndex
Block locations: array([0])
Block lengths: array([10])
But not 100x100.
import pandas as pd
import scipy.sparse as ss
import numpy as np
row = (np.random.random(100)*100).astype(int)
col = (np.random.random(100)*100).astype(int)
val = np.random.random(100)*100
sparse = ss.coo_matrix((val,(row,col)),shape=(100,100))
pss = pd.SparseSeries.from_coo(sparse)
print pss
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-790-f0c22a601b93> in <module>()
7 sparse = ss.coo_matrix((val,(row,col)),shape=(100,100))
8 pss = pd.SparseSeries.from_coo(sparse)
----> 9 print pss
10
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\base.pyc in __str__(self)
45 if compat.PY3:
46 return self.__unicode__()
---> 47 return self.__bytes__()
48
49 def __bytes__(self):
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\base.pyc in __bytes__(self)
57
58 encoding = get_option("display.encoding")
---> 59 return self.__unicode__().encode(encoding, 'replace')
60
61 def __repr__(self):
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\sparse\series.pyc in __unicode__(self)
287 def __unicode__(self):
288 # currently, unicode is same as repr...fixes infinite loop
--> 289 series_rep = Series.__unicode__(self)
290 rep = '%s\n%s' % (series_rep, repr(self.sp_index))
291 return rep
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\series.pyc in __unicode__(self)
895
896 self.to_string(buf=buf, name=self.name, dtype=self.dtype,
--> 897 max_rows=max_rows)
898 result = buf.getvalue()
899
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\series.pyc in to_string(self, buf, na_rep, float_format, header, length, dtype, name, max_rows)
960 the_repr = self._get_repr(float_format=float_format, na_rep=na_rep,
961 header=header, length=length, dtype=dtype,
--> 962 name=name, max_rows=max_rows)
963
964 # catch contract violations
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\series.pyc in _get_repr(self, name, header, length, dtype, na_rep, float_format, max_rows)
989 na_rep=na_rep,
990 float_format=float_format,
--> 991 max_rows=max_rows)
992 result = formatter.to_string()
993
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\format.pyc in __init__(self, series, buf, length, header, na_rep, name, float_format, dtype, max_rows)
145 self.dtype = dtype
146
--> 147 self._chk_truncate()
148
149 def _chk_truncate(self):
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\core\format.pyc in _chk_truncate(self)
158 else:
159 row_num = max_rows // 2
--> 160 series = concat((series.iloc[:row_num], series.iloc[-row_num:]))
161 self.tr_row_num = row_num
162 self.tr_series = series
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\tools\merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
752 keys=keys, levels=levels, names=names,
753 verify_integrity=verify_integrity,
--> 754 copy=copy)
755 return op.get_result()
756
C:\Users\ej\AppData\Local\Continuum\Anaconda\lib\site-packages\pandas\tools\merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
803 for obj in objs:
804 if not isinstance(obj, NDFrame):
--> 805 raise TypeError("cannot concatenate a non-NDFrame object")
806
807 # consolidate
TypeError: cannot concatenate a non-NDFrame object
I don't really understand the error message - I think I am following the example in the documentation to the letter, just using my own COO matrix (could it be the size?)
Regards
I have an older pandas. It has the sparse code, but not the to_coo/from_coo conversions.
The pandas issue that has been filed in connection with this is:
https://github.com/pydata/pandas/issues/10818
But I found this on GitHub:
def _coo_to_sparse_series(A, dense_index=False):
    """ Convert a scipy.sparse.coo_matrix to a SparseSeries.
    Use the defaults given in the SparseSeries constructor. """
    s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    s = s.sort_index()
    s = s.to_sparse()  # TODO: specify kind?
    # ...
    return s
With a smallish sparse matrix I construct and display without problems:
In [259]: Asml=sparse.coo_matrix(np.arange(10*5).reshape(10,5))
In [260]: s=pd.Series(Asml.data,pd.MultiIndex.from_arrays((Asml.row,Asml.col)))
In [261]: s=s.sort_index()
In [262]: s
Out[262]:
0 1 1
2 2
3 3
4 4
1 0 5
1 6
2 7
[... mine]
3 48
4 49
dtype: int32
In [263]: ssml=s.to_sparse()
In [264]: ssml
Out[264]:
0 1 1
2 2
3 3
4 4
1 0 5
[... mine]
2 47
3 48
4 49
dtype: int32
BlockIndex
Block locations: array([0])
Block lengths: array([49])
but with a larger array (more nonzero elements) I get a display error. I'm guessing it happens when the display for the (plain) series starts to use an ellipsis (...). I'm running in Py3, so I get a different error message.
....\pandas\core\base.pyc in __str__(self)
45 if compat.PY3:
46 return self.__unicode__() # py3
47 return self.__bytes__() # py2 route
e.g.:
In [265]: Asml=sparse.coo_matrix(np.arange(10*7).reshape(10,7))
In [266]: s=pd.Series(Asml.data,pd.MultiIndex.from_arrays((Asml.row,Asml.col)))
In [267]: s=s.sort_index()
In [268]: s
Out[268]:
0 1 1
2 2
3 3
4 4
5 5
6 6
1 0 7
1 8
2 9
3 10
4 11
5 12
6 13
2 0 14
1 15
...
7 6 55
8 0 56
1 57
[... mine]
Length: 69, dtype: int32
In [269]: ssml=s.to_sparse()
In [270]: ssml
Out[270]: <repr(<pandas.sparse.series.SparseSeries at 0xaff6bc0c>)
failed: AttributeError: 'SparseArray' object has no attribute '_get_repr'>
I'm not sufficiently familiar with pandas code and structures to deduce much more for now.
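That said, if you just need to inspect the values while the sparse repr is broken, a possible workaround (a sketch; to_dense materializes the full series, so only for smallish data):
# Print via the plain Series repr instead of the sparse one.
print(pss.to_dense())

# Or look at the underlying pieces directly:
print(pss.sp_values)  # the stored non-fill values
print(pss.sp_index)   # the sparse (block) index metadata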