diff --git a/pandas/core/format.py b/pandas/core/format.py index 6d0b0596d08d2..43eb0e890aa62 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1024,9 +1024,8 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, # preallocate data 2d list self.blocks = self.obj._data.blocks - ncols = sum(len(b.items) for b in self.blocks) + ncols = sum(b.shape[0] for b in self.blocks) self.data = [None] * ncols - self.column_map = self.obj._data.get_items_map(use_cached=False) if chunksize is None: chunksize = (100000 / (len(self.cols) or 1)) or 1 @@ -1293,10 +1292,9 @@ def _save_chunk(self, start_i, end_i): float_format=self.float_format, date_format=self.date_format) - for i, item in enumerate(b.items): - + for col_loc, col in zip(b.mgr_locs, d): # self.data is a preallocated list - self.data[self.column_map[b][i]] = d[i] + self.data[col_loc] = col ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep, float_format=self.float_format, diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 23736dafe3556..fcd2e65afddcb 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1043,9 +1043,11 @@ def to_panel(self): new_blocks = [] for block in selfsorted._data.blocks: - newb = block2d_to_blocknd(block.values.T, block.items, shape, - [major_labels, minor_labels], - ref_items=selfsorted.columns) + newb = block2d_to_blocknd( + values=block.values.T, + placement=block.mgr_locs, shape=shape, + labels=[major_labels, minor_labels], + ref_items=selfsorted.columns) new_blocks.append(newb) # preserve names, if any @@ -1934,7 +1936,9 @@ def _ensure_valid_index(self, value): raise ValueError('Cannot set a frame with no defined index ' 'and a value that cannot be converted to a ' 'Series') - self._data.set_axis(1, value.index.copy(), check_axis=False) + + self._data = self._data.reindex_axis(value.index.copy(), axis=1, + fill_value=np.nan) # we are a scalar # noop @@ -2039,7 +2043,11 @@ def _sanitize_column(self, key, value): @property def _series(self): - return self._data.get_series_dict() + result = {} + for idx, item in enumerate(self.columns): + result[item] = Series(self._data.iget(idx), index=self.index, + name=item) + return result def lookup(self, row_labels, col_labels): """Label-based "fancy indexing" function for DataFrame. 
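
Both the `_save_chunk` rewrite and the new `_series` property above lean on the idea that runs through this whole patch: a block carries the integer positions (`mgr_locs`) of its rows within the manager, so callers scatter or gather per-column data positionally instead of consulting a side table such as the removed `column_map`. A minimal sketch of the scatter step in plain numpy (names are illustrative, not the pandas API):

    import numpy as np

    data = [None] * 3                    # preallocated, one slot per column
    block_cols = [np.array(['a', 'b']),  # a block's converted columns...
                  np.array(['c', 'd'])]
    mgr_locs = [2, 0]                    # ...and where they sit in the frame

    for col_loc, col in zip(mgr_locs, block_cols):
        data[col_loc] = col              # direct scatter, no column_map lookup

    assert data[2] is block_cols[0] and data[0] is block_cols[1]
    assert data[1] is None               # that column belongs to another block
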
@@ -2629,16 +2637,14 @@ def trans(v): indexer = _nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=bm_axis, + convert=False, verify=False) + if inplace: - if axis == 1: - new_data = self._data.reindex_items( - self._data.items[indexer], - copy=False) - elif axis == 0: - new_data = self._data.take(indexer) - self._update_inplace(new_data) + return self._update_inplace(new_data) else: - return self.take(indexer, axis=axis, convert=False, is_copy=False) + return self._constructor(new_data).__finalize__(self) def sortlevel(self, level=0, axis=0, ascending=True, inplace=False): """ @@ -2673,16 +2679,13 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False): else: return self.take(indexer, axis=axis, convert=False) + bm_axis = self._get_block_manager_axis(axis) + new_data = self._data.take(indexer, axis=bm_axis, + convert=False, verify=False) if inplace: - if axis == 1: - new_data = self._data.reindex_items( - self._data.items[indexer], - copy=False) - elif axis == 0: - new_data = self._data.take(indexer) - self._update_inplace(new_data) + return self._update_inplace(new_data) else: - return self.take(indexer, axis=axis, convert=False, is_copy=False) + return self._constructor(new_data).__finalize__(self) def swaplevel(self, i, j, axis=0): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d894289c87eee..3f2ecd8afd2d4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -565,7 +565,7 @@ def f(x): f = _get_rename_function(v) baxis = self._get_block_manager_axis(axis) - result._data = result._data.rename(f, axis=baxis, copy=copy) + result._data = result._data.rename_axis(f, axis=baxis, copy=copy) result._clear_item_cache() if inplace: @@ -1217,21 +1217,9 @@ def take(self, indices, axis=0, convert=True, is_copy=True): taken : type of caller """ - # check/convert indicies here - if convert: - axis = self._get_axis_number(axis) - indices = _maybe_convert_indices( - indices, len(self._get_axis(axis))) - - baxis = self._get_block_manager_axis(axis) - if baxis == 0: - labels = self._get_axis(axis) - new_items = labels.take(indices) - new_data = self._data.reindex_axis(new_items, indexer=indices, - axis=baxis) - else: - new_data = self._data.take(indices, axis=baxis) - + new_data = self._data.take(indices, + axis=self._get_block_manager_axis(axis), + convert=True, verify=True) result = self._constructor(new_data).__finalize__(self) # maybe set copy if we didn't actually change the index @@ -1701,7 +1689,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, labels, method, level, limit=limit, copy_if_needed=True) return self._reindex_with_indexers( {axis: [new_index, indexer]}, method=method, fill_value=fill_value, - limit=limit, copy=copy).__finalize__(self) + limit=limit, copy=copy) def _reindex_with_indexers(self, reindexers, method=None, fill_value=np.nan, limit=None, copy=False, @@ -1716,30 +1704,16 @@ def _reindex_with_indexers(self, reindexers, method=None, if index is None: continue - index = _ensure_index(index) - # reindex the axis - if method is not None: - new_data = new_data.reindex_axis( - index, indexer=indexer, method=method, axis=baxis, - fill_value=fill_value, limit=limit, copy=copy) - - elif indexer is not None: - # TODO: speed up on homogeneous DataFrame objects + index = _ensure_index(index) + if indexer is not None: indexer = com._ensure_int64(indexer) - new_data = new_data.reindex_indexer(index, 
indexer, axis=baxis, - fill_value=fill_value, - allow_dups=allow_dups) - - elif (baxis == 0 and index is not None and - index is not new_data.axes[baxis]): - new_data = new_data.reindex_items(index, copy=copy, - fill_value=fill_value) - - elif (baxis > 0 and index is not None and - index is not new_data.axes[baxis]): - new_data = new_data.copy(deep=copy) - new_data.set_axis(baxis, index) + + # TODO: speed up on homogeneous DataFrame objects + new_data = new_data.reindex_indexer(index, indexer, axis=baxis, + fill_value=fill_value, + allow_dups=allow_dups, + copy=copy) if copy and new_data is self._data: new_data = new_data.copy() diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c0222ad248e0c..f650b41ff12be 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2196,10 +2196,10 @@ def _iterate_slices(self): yield val, slicer(val) def _cython_agg_general(self, how, numeric_only=True): - new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) - return self._wrap_agged_blocks(new_blocks) + new_items, new_blocks = self._cython_agg_blocks(how, numeric_only=numeric_only) + return self._wrap_agged_blocks(new_items, new_blocks) - def _wrap_agged_blocks(self, blocks): + def _wrap_agged_blocks(self, items, blocks): obj = self._obj_with_exclusions new_axes = list(obj._data.axes) @@ -2210,6 +2210,10 @@ def _wrap_agged_blocks(self, blocks): else: new_axes[self.axis] = self.grouper.result_index + # Make sure block manager integrity check passes. + assert new_axes[0].equals(items) + new_axes[0] = items + mgr = BlockManager(blocks, new_axes) new_obj = type(obj)(mgr) @@ -2223,14 +2227,14 @@ def _cython_agg_blocks(self, how, numeric_only=True): new_blocks = [] + if numeric_only: + data = data.get_numeric_data(copy=False) + for block in data.blocks: values = block.values is_numeric = is_numeric_dtype(values.dtype) - if numeric_only and not is_numeric: - continue - if is_numeric: values = com.ensure_float(values) @@ -2239,13 +2243,13 @@ def _cython_agg_blocks(self, how, numeric_only=True): # see if we can cast the block back to the original dtype result = block._try_cast_result(result) - newb = make_block(result, block.items, block.ref_items) + newb = make_block(result, placement=block.mgr_locs) new_blocks.append(newb) if len(new_blocks) == 0: raise DataError('No numeric types to aggregate') - return new_blocks + return data.items, new_blocks def _get_data_to_aggregate(self): obj = self._obj_with_exclusions @@ -2837,28 +2841,10 @@ def _wrap_aggregated_output(self, output, names=None): return result.convert_objects() - def _wrap_agged_blocks(self, blocks): - obj = self._obj_with_exclusions - - if self.axis == 0: - agg_labels = obj.columns - else: - agg_labels = obj.index - - if sum(len(x.items) for x in blocks) == len(agg_labels): - output_keys = agg_labels - else: - all_items = [] - for b in blocks: - all_items.extend(b.items) - output_keys = agg_labels[agg_labels.isin(all_items)] - - for blk in blocks: - blk.set_ref_items(output_keys, maybe_rename=False) - + def _wrap_agged_blocks(self, items, blocks): if not self.as_index: index = np.arange(blocks[0].values.shape[1]) - mgr = BlockManager(blocks, [output_keys, index]) + mgr = BlockManager(blocks, [items, index]) result = DataFrame(mgr) group_levels = self.grouper.get_group_levels() @@ -2869,7 +2855,7 @@ def _wrap_agged_blocks(self, blocks): result = result.consolidate() else: index = self.grouper.result_index - mgr = BlockManager(blocks, [output_keys, index]) + mgr = BlockManager(blocks, [items, index]) result = 
DataFrame(mgr) if self.axis == 1: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 792a310c8a554..7465fad39496c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,30 +1,39 @@ +import copy import itertools import re import operator from datetime import datetime, timedelta -import copy -from collections import defaultdict +from collections import defaultdict, deque import numpy as np from pandas.core.base import PandasObject +from pandas.hashtable import Factorizer from pandas.core.common import (_possibly_downcast_to_dtype, isnull, notnull, _NS_DTYPE, _TD_DTYPE, ABCSeries, is_list_like, ABCSparseSeries, _infer_dtype_from_scalar, - _values_from_object, _is_null_datelike_scalar) -from pandas.core.index import Index, MultiIndex, _ensure_index + _is_null_datelike_scalar, + is_timedelta64_dtype, is_datetime64_dtype,) +from pandas.core.index import Index, Int64Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib import pandas.computation.expressions as expressions +from pandas.util.decorators import cache_readonly from pandas.tslib import Timestamp from pandas import compat -from pandas.compat import range, lrange, lmap, callable, map, zip, u +from pandas.compat import (range, lrange, lmap, callable, map, zip, u, + OrderedDict) from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + +from pandas.lib import BlockPlacement + + class Block(PandasObject): """ @@ -33,7 +42,7 @@ class Block(PandasObject): Index-ignorant; let the container take care of that """ - __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim'] + __slots__ = ['_mgr_locs', 'values', 'ndim'] is_numeric = False is_float = False is_integer = False @@ -49,29 +58,20 @@ class Block(PandasObject): _verify_integrity = True _ftype = 'dense' - def __init__(self, values, items, ref_items, ndim=None, fastpath=False, - placement=None): - + def __init__(self, values, placement, ndim=None, fastpath=False): if ndim is None: ndim = values.ndim - - if values.ndim != ndim: + elif values.ndim != ndim: raise ValueError('Wrong number of dimensions') + self.ndim = ndim - if len(items) != len(values): - raise ValueError('Wrong number of items passed %d, index implies ' - '%d' % (len(values), len(items))) - - self.set_ref_locs(placement) + self.mgr_locs = placement self.values = values - self.ndim = ndim - if fastpath: - self.items = items - self.ref_items = ref_items - else: - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) + if len(self.mgr_locs) != len(self.values): + raise ValueError('Wrong number of items passed %d,' + ' placement implies %d' % ( + len(self.values), len(self.mgr_locs))) @property def _consolidate_key(self): @@ -91,79 +91,28 @@ def fill_value(self): return np.nan @property - def ref_locs(self): - if self._ref_locs is None: - # we have a single block, maybe have duplicates - # but indexer is easy - # also if we are not really reindexing, just numbering - if self._is_single_block or self.ref_items.equals(self.items): - indexer = np.arange(len(self.items)) - else: - - indexer = self.ref_items.get_indexer(self.items) - indexer = com._ensure_platform_int(indexer) - if (indexer == -1).any(): + def mgr_locs(self): + return self._mgr_locs - # this means that we have nan's in our block - try: - indexer[indexer == -1] = np.arange( - 
len(self.items))[isnull(self.items)] - except: - raise AssertionError('Some block items were not in ' - 'block ref_items') - - self._ref_locs = indexer - return self._ref_locs - - def take_ref_locs(self, indexer): - """ - need to preserve the ref_locs and just shift them - return None if ref_locs is None - - see GH6509 + def make_block_same_class(self, values, placement, copy=False, + **kwargs): """ + Wrap given values in a block of same type as self. - ref_locs = self._ref_locs - if ref_locs is None: - return None - - tindexer = np.ones(len(ref_locs),dtype=bool) - tindexer[indexer] = False - tindexer = tindexer.astype(int).cumsum()[indexer] - ref_locs = ref_locs[indexer] - - # Make sure the result is a copy, or otherwise self._ref_locs will be - # updated. - if ref_locs.base is not None: - ref_locs = ref_locs.copy() - - ref_locs -= tindexer - return ref_locs + `kwargs` are used in SparseBlock override. - def reset_ref_locs(self): - """ reset the block ref_locs """ - self._ref_locs = np.empty(len(self.items), dtype='int64') - - def set_ref_locs(self, placement): - """ explicity set the ref_locs indexer, only necessary for duplicate - indicies """ - if placement is None: - self._ref_locs = None - else: - self._ref_locs = np.array(placement, dtype='int64', copy=True) + if copy: + values = values.copy() + return make_block(values, placement, klass=self.__class__, + fastpath=True) - def set_ref_items(self, ref_items, maybe_rename=True): - """ - If maybe_rename=True, need to set the items for this guy - """ - if not isinstance(ref_items, Index): - raise AssertionError('block ref_items must be an Index') - if maybe_rename == 'clear': - self._ref_locs = None - elif maybe_rename: - self.items = ref_items.take(self.ref_locs) - self.ref_items = ref_items + @mgr_locs.setter + def mgr_locs(self, new_mgr_locs): + if not isinstance(new_mgr_locs, BlockPlacement): + new_mgr_locs = BlockPlacement(new_mgr_locs) + + self._mgr_locs = new_mgr_locs def __unicode__(self): @@ -178,32 +127,47 @@ def __unicode__(self): shape = ' x '.join([com.pprint_thing(s) for s in self.shape]) result = '%s: %s, %s, dtype: %s' % ( - name, com.pprint_thing(self.items), shape, self.dtype) + name, com.pprint_thing(self.mgr_locs.indexer), shape, + self.dtype) return result - def __contains__(self, item): - return item in self.items - def __len__(self): return len(self.values) def __getstate__(self): - # should not pickle generally (want to share ref_items), but here for - # completeness - return (self.items, self.ref_items, self.values) + return self.mgr_locs.indexer, self.values def __setstate__(self, state): - items, ref_items, values = state - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) - self.values = values - self.ndim = values.ndim + self.mgr_locs = BlockPlacement(state[0]) + self.values = state[1] + self.ndim = self.values.ndim def _slice(self, slicer): """ return a slice of my values """ return self.values[slicer] + def getitem_block(self, slicer, new_mgr_locs=None): + """ + Perform __getitem__-like, return result as block. + + As of now, only supports slices that preserve dimensionality. 
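
The invariant `getitem_block` maintains is that `values` and `mgr_locs` are sliced with the same axis-0 slicer, so the surviving rows of the block keep pointing at the correct manager positions. A plain-numpy sketch of that bookkeeping (illustrative only, not the pandas classes):

    import numpy as np

    values = np.arange(12).reshape(4, 3)   # a block with 4 rows (items)
    mgr_locs = np.array([7, 1, 4, 2])      # their positions in the manager

    slicer = slice(1, 3)
    new_values = values[slicer]
    new_locs = mgr_locs[slicer]            # same slicer, applied in lockstep

    assert new_values.shape[0] == len(new_locs) == 2
    assert list(new_locs) == [1, 4]
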
+ + """ + if new_mgr_locs is None: + if isinstance(slicer, tuple): + axis0_slicer = slicer[0] + else: + axis0_slicer = slicer + new_mgr_locs = self.mgr_locs[axis0_slicer] + + new_values = self._slice(slicer) + + if new_values.ndim != self.ndim: + raise ValueError("Only same dim slicing is allowed") + + return self.make_block_same_class(new_values, new_mgr_locs) + @property def shape(self): return self.values.shape @@ -220,22 +184,8 @@ def dtype(self): def ftype(self): return "%s:%s" % (self.dtype, self._ftype) - def as_block(self, result): - """ if we are not a block, then wrap as a block, must have compatible shape """ - if not isinstance(result, Block): - result = make_block(result, - self.items, - self.ref_items) - return result - def merge(self, other): - if not self.ref_items.equals(other.ref_items): - raise AssertionError('Merge operands must have same ref_items') - - # Not sure whether to allow this or not - # if not union_ref.equals(other.ref_items): - # union_ref = self.ref_items + other.ref_items - return _merge_blocks([self, other], self.ref_items) + return _merge_blocks([self, other]) def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): @@ -249,62 +199,9 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, new_values = com.take_nd(self.values, indexer, axis, fill_value=fill_value, mask_info=mask_info) - return make_block(new_values, self.items, self.ref_items, + return make_block(new_values, ndim=self.ndim, fastpath=True, - placement=self._ref_locs) - - def reindex_items_from(self, new_ref_items, indexer=None, method=None, - fill_value=None, limit=None, copy=True): - """ - Reindex to only those items contained in the input set of items - - E.g. if you have ['a', 'b'], and the input items is ['b', 'c', 'd'], - then the resulting items will be ['b'] - - Returns - ------- - reindexed : Block - """ - if indexer is None: - new_ref_items, indexer = self.items.reindex(new_ref_items, - limit=limit) - - needs_fill = method is not None - if fill_value is None: - fill_value = self.fill_value - - new_items = new_ref_items - if indexer is None: - new_values = self.values.copy() if copy else self.values - - else: - - # single block reindex, filling is already happending - if self.ndim == 1: - new_values = com.take_1d(self.values, indexer, - fill_value=fill_value) - block = make_block(new_values, new_items, new_ref_items, - ndim=self.ndim, fastpath=True) - return block - else: - - masked_idx = indexer[indexer != -1] - new_items = self.items.take(masked_idx) - new_values = com.take_nd(self.values, masked_idx, axis=0, - allow_fill=False) - # fill if needed - if needs_fill: - new_values = com.interpolate_2d(new_values, method=method, - limit=limit, fill_value=fill_value) - - block = make_block(new_values, new_items, new_ref_items, - ndim=self.ndim, fastpath=True) - - # down cast if needed - if not self.is_float and (needs_fill or notnull(fill_value)): - block = block.downcast() - - return block + placement=self.mgr_locs) def get(self, item): loc = self.items.get_loc(item) @@ -313,7 +210,7 @@ def get(self, item): def iget(self, i): return self.values[i] - def set(self, item, value, check=False): + def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -321,50 +218,22 @@ def set(self, item, value, check=False): ------- None """ - loc = self.items.get_loc(item) - self.values[loc] = value - - def delete(self, item): - """ - Returns - ------- - y : Block (new object) - """ - loc = 
self.items.get_loc(item) - new_items = self.items.delete(loc) - new_values = np.delete(self.values, loc, 0) - return make_block(new_values, new_items, self.ref_items, - ndim=self.ndim, klass=self.__class__, fastpath=True) + self.values[locs] = values - def split_block_at(self, item): + def delete(self, loc): """ - Split block into zero or more blocks around columns with given label, - for "deleting" a column without having to copy data by returning views - on the original array. - - Returns - ------- - generator of Block + Delete given loc(-s) from block in-place. """ - loc = self.items.get_loc(item) - - if type(loc) == slice or type(loc) == int: - mask = [True] * len(self) - mask[loc] = False - else: # already a mask, inverted - mask = -loc - - for s, e in com.split_ranges(mask): - yield make_block(self.values[s:e], - self.items[s:e].copy(), - self.ref_items, - ndim=self.ndim, - klass=self.__class__, - fastpath=True) + self.values = np.delete(self.values, loc, 0) + self.mgr_locs = self.mgr_locs.delete(loc) def apply(self, func, **kwargs): """ apply the function to my values; return a block if we are not one """ - return self.as_block(func(self.values)) + result = func(self.values) + if not isinstance(result, Block): + result = make_block(values=result, placement=self.mgr_locs,) + + return result def fillna(self, value, limit=None, inplace=False, downcast=None): if not self._can_hold_na: @@ -415,8 +284,8 @@ def downcast(self, dtypes=None): dtypes = 'infer' nv = _possibly_downcast_to_dtype(values, dtypes) - return [make_block(nv, self.items, self.ref_items, ndim=self.ndim, - fastpath=True)] + return [make_block(nv, ndim=self.ndim, + fastpath=True, placement=self.mgr_locs)] # ndim > 1 if dtypes is None: @@ -429,11 +298,12 @@ def downcast(self, dtypes=None): # item-by-item # this is expensive as it splits the blocks items-by-item blocks = [] - for i, item in enumerate(self.items): + for i, rl in enumerate(self.mgr_locs): if dtypes == 'infer': dtype = 'infer' else: + raise AssertionError("dtypes as dict is not supported yet") dtype = dtypes.get(item, self._downcast_dtype) if dtype is None: @@ -442,8 +312,9 @@ def downcast(self, dtypes=None): nv = _possibly_downcast_to_dtype(values[i], dtype) nv = _block_shape(nv, ndim=self.ndim) - blocks.append(make_block(nv, Index([item]), self.ref_items, - ndim=self.ndim, fastpath=True)) + blocks.append(make_block(nv, + ndim=self.ndim, fastpath=True, + placement=[rl])) return blocks @@ -466,9 +337,11 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, try: # force the copy here if values is None: - values = com._astype_nansafe(self.values, dtype, copy=True) - newb = make_block(values, self.items, self.ref_items, - ndim=self.ndim, placement=self._ref_locs, + # _astype_nansafe works fine with 1-d only + values = com._astype_nansafe(self.values.ravel(), dtype, copy=True) + values = values.reshape(self.values.shape) + newb = make_block(values, + ndim=self.ndim, placement=self.mgr_locs, fastpath=True, dtype=dtype, klass=klass) except: if raise_on_error is True: @@ -482,7 +355,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, "(%s [%s])" % (copy, self.dtype.name, self.itemsize, newb.dtype.name, newb.itemsize)) - return [newb] + return newb def convert(self, copy=True, **kwargs): """ attempt to coerce any object types to better types @@ -491,31 +364,6 @@ def convert(self, copy=True, **kwargs): return [self.copy()] if copy else [self] - def prepare_for_merge(self, **kwargs): - """ a regular block is ok to merge as is 
""" - return self - - def post_merge(self, items, **kwargs): - """ we are non-sparse block, try to convert to a sparse block(s) """ - overlap = set(items.keys()) & set(self.items) - if len(overlap): - overlap = _ensure_index(overlap) - - new_blocks = [] - for item in overlap: - dtypes = set(items[item]) - - # this is a safe bet with multiple dtypes - dtype = list(dtypes)[0] if len(dtypes) == 1 else np.float64 - - b = make_block(SparseArray(self.get(item), dtype=dtype), - [item], self.ref_items) - new_blocks.append(b) - - return new_blocks - - return self - def _can_hold_element(self, value): raise NotImplementedError() @@ -581,15 +429,13 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): return values.tolist() # block actions #### - def copy(self, deep=True, ref_items=None): + def copy(self, deep=True): values = self.values if deep: values = values.copy() - if ref_items is None: - ref_items = self.ref_items - return make_block(values, self.items, ref_items, ndim=self.ndim, + return make_block(values, ndim=self.ndim, klass=self.__class__, fastpath=True, - placement=self._ref_locs) + placement=self.mgr_locs) def replace(self, to_replace, value, inplace=False, filter=None, regex=False): @@ -599,9 +445,8 @@ def replace(self, to_replace, value, inplace=False, filter=None, compatibility.""" mask = com.mask_missing(self.values, to_replace) if filter is not None: - for i, item in enumerate(self.items): - if item not in filter: - mask[i] = False + filtered_out = ~self.mgr_locs.isin(filter) + mask[filtered_out.nonzero()[0]] = False if not mask.any(): if inplace: @@ -672,8 +517,8 @@ def setitem(self, indexer, value): dtype = 'infer' values = self._try_coerce_result(values) values = self._try_cast_result(values, dtype) - return [make_block(transf(values), self.items, self.ref_items, - ndim=self.ndim, placement=self._ref_locs, + return [make_block(transf(values), + ndim=self.ndim, placement=self.mgr_locs, fastpath=True)] except (ValueError, TypeError) as detail: raise @@ -704,21 +549,11 @@ def putmask(self, mask, new, align=True, inplace=False): # may need to align the new if hasattr(new, 'reindex_axis'): - if align: - axis = getattr(new, '_info_axis_number', 0) - new = new.reindex_axis(self.items, axis=axis, - copy=False).values.T - else: - new = new.values.T + new = new.values.T # may need to align the mask if hasattr(mask, 'reindex_axis'): - if align: - axis = getattr(mask, '_info_axis_number', 0) - mask = mask.reindex_axis( - self.items, axis=axis, copy=False).values.T - else: - mask = mask.values.T + mask = mask.values.T # if we are passed a scalar None, convert it here if not is_list_like(new) and isnull(new): @@ -738,45 +573,8 @@ def putmask(self, mask, new, align=True, inplace=False): # need to go column by column new_blocks = [] - - def create_block(v, m, n, item, reshape=True): - """ return a new block, try to preserve dtype if possible """ - - # n should be the length of the mask or a scalar here - if not is_list_like(n): - n = np.array([n] * len(m)) - - # see if we are only masking values that if putted - # will work in the current dtype - nv = None - try: - nn = n[m] - nn_at = nn.astype(self.dtype) - if (nn == nn_at).all(): - nv = v.copy() - nv[mask] = nn_at - except (ValueError, IndexError, TypeError): - pass - - # change the dtype - if nv is None: - dtype, _ = com._maybe_promote(n.dtype) - nv = v.astype(dtype) - try: - nv[m] = n - except ValueError: - idx, = np.where(np.squeeze(m)) - for mask_index, new_val in zip(idx, n): - nv[mask_index] = new_val - - if reshape: - nv = 
_block_shape(nv) - return make_block(nv, [item], self.ref_items) - else: - return make_block(nv, item, self.ref_items) - if self.ndim > 1: - for i, item in enumerate(self.items): + for i, ref_loc in enumerate(self.mgr_locs): m = mask[i] v = new_values[i] @@ -792,27 +590,31 @@ def create_block(v, m, n, item, reshape=True): # we need to exiplicty astype here to make a copy n = n.astype(dtype) - block = create_block(v, m, n, item) - + nv = _putmask_smart(v, m, n) else: nv = v if inplace else v.copy() - nv = _block_shape(nv) - block = make_block( - nv, Index([item]), self.ref_items, fastpath=True) + + # Put back the dimension that was taken from it and make + # a block out of the result. + block = make_block(values=nv[np.newaxis], + placement=[ref_loc], + fastpath=True) new_blocks.append(block) else: - new_blocks.append(create_block(new_values, mask, new, - self.items, reshape=False)) + nv = _putmask_smart(new_values, mask, new) + new_blocks.append(make_block(values=nv, + placement=self.mgr_locs, + fastpath=True)) return new_blocks if inplace: return [self] - return [make_block(new_values, self.items, self.ref_items, - placement=self._ref_locs, fastpath=True)] + return [make_block(new_values, + placement=self.mgr_locs, fastpath=True)] def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, @@ -891,9 +693,9 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, dtype=self.dtype) values = self._try_coerce_result(values) - blocks = [make_block(values, self.items, self.ref_items, + blocks = [make_block(values, ndim=self.ndim, klass=self.__class__, - fastpath=True)] + fastpath=True, placement=self.mgr_locs)] return self._maybe_downcast(blocks, downcast) def _interpolate(self, method=None, index=None, values=None, @@ -930,36 +732,49 @@ def func(x): # interp each column independently interp_values = np.apply_along_axis(func, axis, data) - blocks = [make_block(interp_values, self.items, self.ref_items, - ndim=self.ndim, klass=self.__class__, fastpath=True)] + blocks = [make_block(interp_values, + ndim=self.ndim, klass=self.__class__, + fastpath=True, placement=self.mgr_locs)] return self._maybe_downcast(blocks, downcast) - def take(self, indexer, ref_items, new_axis, axis=1): - if axis < 1: - raise AssertionError('axis must be at least 1, got %d' % axis) - new_values = com.take_nd(self.values, indexer, axis=axis, - allow_fill=False) + def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block.bb - # need to preserve the ref_locs and just shift them - # GH6121 - ref_locs = None - if not new_axis.is_unique: - ref_locs = self._ref_locs + """ + if fill_tuple is None: + fill_value = self.fill_value + new_values = com.take_nd(self.get_values(), indexer, axis=axis, + allow_fill=False) + else: + fill_value = fill_tuple[0] + new_values = com.take_nd(self.get_values(), indexer, axis=axis, + allow_fill=True, fill_value=fill_value) + + if new_mgr_locs is None: + if axis == 0: + slc = lib.indexer_as_slice(indexer) + if slc is not None: + new_mgr_locs = self.mgr_locs[slc] + else: + new_mgr_locs = self.mgr_locs[indexer] + else: + new_mgr_locs = self.mgr_locs - return [make_block(new_values, self.items, ref_items, ndim=self.ndim, - klass=self.__class__, placement=ref_locs, fastpath=True)] + if new_values.dtype != self.dtype: + return make_block(new_values, new_mgr_locs) + else: + return self.make_block_same_class(new_values, new_mgr_locs) def get_values(self, dtype=None): return 
self.values - def get_merge_length(self): - return len(self.values) - def diff(self, n): """ return block for the diff of the values """ new_values = com.diff(self.values, n, axis=1) - return [make_block(new_values, self.items, self.ref_items, - ndim=self.ndim, fastpath=True)] + return [make_block(values=new_values, + ndim=self.ndim, fastpath=True, + placement=self.mgr_locs)] def shift(self, periods, axis=0): """ shift the block by periods, possibly upcast """ @@ -983,8 +798,9 @@ def shift(self, periods, axis=0): if f_ordered: new_values = new_values.T - return [make_block(new_values, self.items, self.ref_items, - ndim=self.ndim, fastpath=True)] + return [make_block(new_values, + ndim=self.ndim, fastpath=True, + placement=self.mgr_locs)] def eval(self, func, other, raise_on_error=True, try_cast=False): """ @@ -1003,11 +819,8 @@ def eval(self, func, other, raise_on_error=True, try_cast=False): """ values = self.values - # see if we can align other if hasattr(other, 'reindex_axis'): - axis = getattr(other, '_info_axis_number', 0) - other = other.reindex_axis( - self.items, axis=axis, copy=False).values + other = other.values # make sure that we can broadcast is_transposed = False @@ -1078,8 +891,8 @@ def handle_error(): if try_cast: result = self._try_cast_result(result) - return [make_block(result, self.items, self.ref_items, ndim=self.ndim, - fastpath=True)] + return [make_block(result, ndim=self.ndim, + fastpath=True, placement=self.mgr_locs)] def where(self, other, cond, align=True, raise_on_error=True, try_cast=False): @@ -1103,12 +916,7 @@ def where(self, other, cond, align=True, raise_on_error=True, # see if we can align other if hasattr(other, 'reindex_axis'): - if align: - axis = getattr(other, '_info_axis_number', 0) - other = other.reindex_axis(self.items, axis=axis, - copy=True).values - else: - other = other.values + other = other.values # make sure that we can broadcast is_transposed = False @@ -1129,10 +937,7 @@ def where(self, other, cond, align=True, raise_on_error=True, raise ValueError( "where must have a condition that is ndarray like") - if align and hasattr(cond, 'reindex_axis'): - axis = getattr(cond, '_info_axis_number', 0) - cond = cond.reindex_axis(self.items, axis=axis, copy=True).values - else: + if hasattr(cond, 'reindex_axis'): cond = cond.values # may need to undo transpose of values @@ -1177,8 +982,8 @@ def func(c, v, o): if try_cast: result = self._try_cast_result(result) - return make_block(result, self.items, self.ref_items, - ndim=self.ndim) + return make_block(result, + ndim=self.ndim, placement=self.mgr_locs) # might need to separate out blocks axis = cond.ndim - 1 @@ -1189,11 +994,10 @@ def func(c, v, o): result_blocks = [] for m in [mask, ~mask]: if m.any(): - items = self.items[m] - slices = [slice(None)] * cond.ndim - slices[axis] = self.items.get_indexer(items) - r = self._try_cast_result(result[slices]) - result_blocks.append(make_block(r.T, items, self.ref_items)) + r = self._try_cast_result( + result.take(m.nonzero()[0], axis=axis)) + result_blocks.append(make_block(r.T, + placement=self.mgr_locs[m])) return result_blocks @@ -1203,11 +1007,13 @@ def equals(self, other): class NumericBlock(Block): + __slots__ = () is_numeric = True _can_hold_na = True class FloatOrComplexBlock(NumericBlock): + __slots__ = () def equals(self, other): if self.dtype != other.dtype or self.shape != other.shape: return False @@ -1215,6 +1021,7 @@ def equals(self, other): return ((left == right) | (np.isnan(left) & np.isnan(right))).all() class 
FloatBlock(FloatOrComplexBlock): + __slots__ = () is_float = True _downcast_dtype = 'int64' @@ -1255,6 +1062,7 @@ def should_store(self, value): class ComplexBlock(FloatOrComplexBlock): + __slots__ = () is_complex = True def _can_hold_element(self, element): @@ -1275,6 +1083,7 @@ def should_store(self, value): class IntBlock(NumericBlock): + __slots__ = () is_integer = True _can_hold_na = False @@ -1295,6 +1104,7 @@ def should_store(self, value): class TimeDeltaBlock(IntBlock): + __slots__ = () is_timedelta = True _can_hold_na = True is_numeric = False @@ -1379,6 +1189,7 @@ def to_native_types(self, slicer=None, na_rep=None, **kwargs): class BoolBlock(NumericBlock): + __slots__ = () is_bool = True _can_hold_na = False @@ -1406,16 +1217,18 @@ def replace(self, to_replace, value, inplace=False, filter=None, inplace=inplace, filter=filter, regex=regex) + class ObjectBlock(Block): + __slots__ = () is_object = True _can_hold_na = True - def __init__(self, values, items, ref_items, ndim=2, fastpath=False, + def __init__(self, values, ndim=2, fastpath=False, placement=None): if issubclass(values.dtype.type, compat.string_types): values = np.array(values, dtype=object) - super(ObjectBlock, self).__init__(values, items, ref_items, ndim=ndim, + super(ObjectBlock, self).__init__(values, ndim=ndim, fastpath=fastpath, placement=placement) @@ -1436,11 +1249,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T """ # attempt to create new type blocks - is_unique = self.items.is_unique blocks = [] if by_item and not self._is_single_block: - for i, c in enumerate(self.items): + for i, rl in enumerate(self.mgr_locs): values = self.iget(i) values = com._possibly_convert_objects( @@ -1449,10 +1261,8 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T convert_timedeltas=convert_timedeltas, ).reshape(values.shape) values = _block_shape(values, ndim=self.ndim) - items = self.items.take([i]) - placement = None if is_unique else [i] - newb = make_block(values, items, self.ref_items, - ndim=self.ndim, placement=placement) + newb = make_block(values, + ndim=self.ndim, placement=[rl]) blocks.append(newb) else: @@ -1461,12 +1271,12 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T self.values.ravel(), convert_dates=convert_dates, convert_numeric=convert_numeric ).reshape(self.values.shape) - blocks.append(make_block(values, self.items, self.ref_items, - ndim=self.ndim)) + blocks.append(make_block(values, + ndim=self.ndim, placement=self.mgr_locs)) return blocks - def set(self, item, value, check=False): + def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -1475,26 +1285,24 @@ def set(self, item, value, check=False): None """ - loc = self.items.get_loc(item) - # GH6026 if check: try: - if (self.values[loc] == value).all(): + if (self.values[locs] == values).all(): return except: pass try: - self.values[loc] = value + self.values[locs] = values except (ValueError): # broadcasting error # see GH6171 - new_shape = list(value.shape) + new_shape = list(values.shape) new_shape[0] = len(self.items) self.values = np.empty(tuple(new_shape),dtype=self.dtype) self.values.fill(np.nan) - self.values[loc] = value + self.values[locs] = values def _maybe_downcast(self, blocks, downcast=None): @@ -1613,27 +1421,29 @@ def re_replacer(s): f = np.vectorize(re_replacer, otypes=[self.dtype]) - try: - filt = lmap(self.items.get_loc, filter) - except TypeError: + if filter is None: filt = slice(None) + 
else: + filt = self.mgr_locs.isin(filter).nonzero()[0] new_values[filt] = f(new_values[filt]) - return [self if inplace else make_block(new_values, self.items, - self.ref_items, fastpath=True)] + return [self if inplace else + make_block(new_values, + fastpath=True, placement=self.mgr_locs)] class DatetimeBlock(Block): + __slots__ = () is_datetime = True _can_hold_na = True - def __init__(self, values, items, ref_items, fastpath=False, - placement=None, **kwargs): + def __init__(self, values, placement, + fastpath=False, **kwargs): if values.dtype != _NS_DTYPE: values = tslib.cast_to_nanoseconds(values) - super(DatetimeBlock, self).__init__(values, items, ref_items, + super(DatetimeBlock, self).__init__(values, fastpath=True, placement=placement, **kwargs) @@ -1705,7 +1515,8 @@ def fillna(self, value, limit=None, np.putmask(values, mask, value) return [self if inplace else - make_block(values, self.items, self.ref_items, fastpath=True)] + make_block(values, + fastpath=True, placement=self.mgr_locs)] def to_native_types(self, slicer=None, na_rep=None, date_format=None, **kwargs): @@ -1745,7 +1556,7 @@ def astype(self, dtype, copy=False, raise_on_error=True): return self._astype(dtype, copy=copy, raise_on_error=raise_on_error, klass=klass) - def set(self, item, value, check=False): + def set(self, locs, values, check=False): """ Modify Block in-place with new item value @@ -1753,12 +1564,11 @@ def set(self, item, value, check=False): ------- None """ - loc = self.items.get_loc(item) - - if value.dtype != _NS_DTYPE: - value = tslib.cast_to_nanoseconds(value) + if values.dtype != _NS_DTYPE: + # Workaround for numpy 1.6 bug + values = tslib.cast_to_nanoseconds(values) - self.values[loc] = value + self.values[locs] = values def get_values(self, dtype=None): # return object dtype as Timestamps @@ -1769,9 +1579,8 @@ def get_values(self, dtype=None): class SparseBlock(Block): - """ implement as a list of sparse arrays of the same dtype """ - __slots__ = ['items', 'ref_items', '_ref_locs', 'ndim', 'values'] + __slots__ = () is_sparse = True is_numeric = True _can_hold_na = True @@ -1779,34 +1588,27 @@ class SparseBlock(Block): _verify_integrity = False _ftype = 'sparse' - def __init__(self, values, items, ref_items, ndim=None, fastpath=False, - placement=None): + def __init__(self, values, placement, + ndim=None, fastpath=False,): # kludgetastic - if ndim is not None: - if ndim == 1: - ndim = 1 - elif ndim > 2: - ndim = ndim - else: - if len(items) != 1: + if ndim is None: + if len(placement) != 1: ndim = 1 else: ndim = 2 self.ndim = ndim - self._ref_locs = None + self.mgr_locs = placement + + if not isinstance(values, SparseArray): + raise TypeError("values must be SparseArray") + self.values = values - if fastpath: - self.items = items - self.ref_items = ref_items - else: - self.items = _ensure_index(items) - self.ref_items = _ensure_index(ref_items) @property def shape(self): - return (len(self.items), self.sp_index.length) + return (len(self.mgr_locs), self.sp_index.length) @property def itemsize(self): @@ -1814,6 +1616,7 @@ def itemsize(self): @property def fill_value(self): + #return np.nan return self.values.fill_value @fill_value.setter @@ -1832,7 +1635,13 @@ def sp_values(self, v): # reset the sparse values self.values = SparseArray(v, sparse_index=self.sp_index, kind=self.kind, dtype=v.dtype, - fill_value=self.fill_value, copy=False) + fill_value=self.values.fill_value, + copy=False) + + def iget(self, col): + if col != 0: + raise IndexError("SparseBlock only contains one item") + return 
self.values @property def sp_index(self): @@ -1851,15 +1660,9 @@ def __len__(self): def should_store(self, value): return isinstance(value, SparseArray) - def prepare_for_merge(self, **kwargs): - """ create a dense block """ - return make_block(self.get_values(), self.items, self.ref_items) - - def post_merge(self, items, **kwargs): - return self - - def set(self, item, value, check=False): - self.values = value + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values def get(self, item): if self.ndim == 1: @@ -1879,33 +1682,52 @@ def get_values(self, dtype=None): values = values.reshape((1,) + values.shape) return values - def get_merge_length(self): - return 1 - - def make_block(self, values, items=None, ref_items=None, sparse_index=None, - kind=None, dtype=None, fill_value=None, copy=False, - fastpath=True): + def copy(self, deep=True): + return self.make_block_same_class(values=self.values, + sparse_index=self.sp_index, + kind=self.kind, copy=deep, + placement=self.mgr_locs) + + def make_block_same_class(self, values, placement, + sparse_index=None, kind=None, dtype=None, + fill_value=None, copy=False, fastpath=True): """ return a new block """ if dtype is None: dtype = self.dtype if fill_value is None: - fill_value = self.fill_value - if items is None: - items = self.items - if ref_items is None: - ref_items = self.ref_items + fill_value = self.values.fill_value + + # if not isinstance(values, SparseArray) and values.ndim != self.ndim: + # raise ValueError("ndim mismatch") + + if values.ndim == 2: + nitems = values.shape[0] + + if nitems == 0: + # kludgy, but SparseBlocks cannot handle slices, where the + # output is 0-item, so let's convert it to a dense block: it + # won't take space since there's 0 items, plus it will preserve + # the dtype. 
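# NOTE (illustrative aside, hypothetical values): a SparseArray is
# inherently 1-d, which is why a 2-d sparse block can describe at most one
# item, and why the branches below special-case 0 rows and 1 row:
#
#     values = np.array([[1.0, np.nan, 3.0]])    # 1-item, 2-d slice
#     values.reshape(values.shape[1]).shape      # -> (3,): feeds SparseArray
#     np.empty((0, 3), dtype=np.float64).dtype   # -> float64: the 0-item
#                                                #    dense fallback keeps it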
+ return make_block(np.empty(values.shape, dtype=dtype), + placement, fastpath=True,) + elif nitems > 1: + raise ValueError("Only 1-item 2d sparse blocks are supported") + else: + values = values.reshape(values.shape[1]) + new_values = SparseArray(values, sparse_index=sparse_index, kind=kind or self.kind, dtype=dtype, fill_value=fill_value, copy=copy) - return make_block(new_values, items, ref_items, ndim=self.ndim, - fastpath=fastpath) + return make_block(new_values, ndim=self.ndim, + fastpath=fastpath, placement=placement) def interpolate(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, **kwargs): values = com.interpolate_2d( self.values.to_dense(), method, axis, limit, fill_value) - return self.make_block(values, self.items, self.ref_items) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) def fillna(self, value, limit=None, inplace=False, downcast=None): # we may need to upcast our fill to match our dtype @@ -1914,8 +1736,9 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): if issubclass(self.dtype.type, np.floating): value = float(value) values = self.values if inplace else self.values.copy() - return [self.make_block(values.get_values(value), fill_value=value)] - + return [self.make_block_same_class(values=values.get_values(value), + fill_value=value, + placement=self.mgr_locs)] def shift(self, periods, axis=0): """ shift the block by periods """ @@ -1933,15 +1756,7 @@ def shift(self, periods, axis=0): new_values[:periods] = fill_value else: new_values[periods:] = fill_value - return [self.make_block(new_values)] - - def take(self, indexer, ref_items, new_axis, axis=1): - """ going to take our items - along the long dimension""" - if axis < 1: - raise AssertionError('axis must be at least 1, got %d' % axis) - - return [self.make_block(self.values.take(indexer))] + return [self.make_block_same_class(new_values, placement=self.mgr_locs)] def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, limit=None, mask_info=None): @@ -1954,53 +1769,9 @@ def reindex_axis(self, indexer, method=None, axis=1, fill_value=None, # taking on the 0th axis always here if fill_value is None: fill_value = self.fill_value - return self.make_block(self.values.take(indexer), items=self.items, - fill_value=fill_value) - - def reindex_items_from(self, new_ref_items, indexer=None, method=None, - fill_value=None, limit=None, copy=True): - """ - Reindex to only those items contained in the input set of items - - E.g. 
if you have ['a', 'b'], and the input items is ['b', 'c', 'd'], - then the resulting items will be ['b'] - - Returns - ------- - reindexed : Block - """ - - # 1-d always - if indexer is None: - new_ref_items, indexer = self.items.reindex(new_ref_items, - limit=limit) - if indexer is None: - indexer = np.arange(len(self.items)) - - # single block - if self.ndim == 1: - - new_items = new_ref_items - new_values = com.take_1d(self.values.values, indexer) - - else: - - # if we don't overlap at all, then don't include this block - new_items = self.items & new_ref_items - if not len(new_items): - return None - - new_values = self.values.values - - # fill if needed - if method is not None or limit is not None: - if fill_value is None: - fill_value = self.fill_value - new_values = com.interpolate_2d(new_values, method=method, - limit=limit, fill_value=fill_value) - - return self.make_block(new_values, items=new_items, - ref_items=new_ref_items, copy=copy) + return self.make_block_same_class(self.values.take(indexer), + fill_value=fill_value, + placement=self.mgr_locs) def sparse_reindex(self, new_index): """ sparse reindex and return a new block @@ -2008,19 +1779,15 @@ def sparse_reindex(self, new_index): values = self.values values = values.sp_index.to_int_index().reindex( values.sp_values.astype('float64'), values.fill_value, new_index) - return self.make_block(values, sparse_index=new_index) - - def split_block_at(self, item): - if len(self.items) == 1 and item == self.items[0]: - return [] - return super(SparseBlock, self).split_block_at(self, item) + return self.make_block_same_class(values, sparse_index=new_index, + placement=self.mgr_locs) def _try_cast_result(self, result, dtype=None): return result -def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None, - fastpath=False, placement=None): +def make_block(values, placement, klass=None, ndim=None, + dtype=None, fastpath=False): if klass is None: dtype = dtype or values.dtype vtype = dtype.type @@ -2066,7 +1833,7 @@ def make_block(values, items, ref_items, klass=None, ndim=None, dtype=None, if klass is None: klass = ObjectBlock - return klass(values, items, ref_items, ndim=ndim, fastpath=fastpath, + return klass(values, ndim=ndim, fastpath=fastpath, placement=placement) @@ -2082,6 +1849,42 @@ class BlockManager(PandasObject): lightweight blocked set of labeled data to be manipulated by the DataFrame public API class + Attributes + ---------- + shape + ndim + axes + values + items + + Methods + ------- + set_axis(axis, new_labels) + copy(deep=True) + + get_dtype_counts + get_ftype_counts + get_dtypes + get_ftypes + + apply(func, axes, block_filter_fn) + + get_bool_data + get_numeric_data + + get_slice(slice_like, axis) + get(label) + iget(loc) + get_scalar(label_tup) + + take(indexer, axis) + reindex_axis(new_labels, axis) + reindex_indexer(new_labels, indexer, axis) + + delete(label) + insert(loc, label, value) + set(label, value) + Parameters ---------- @@ -2091,28 +1894,28 @@ class BlockManager(PandasObject): This is *not* a public API class """ __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated', - '_is_consolidated', '_has_sparse', '_ref_locs', '_items_map'] + '_is_consolidated', '_blknos', '_blklocs'] def __init__(self, blocks, axes, do_integrity_check=True, fastpath=True): self.axes = [_ensure_index(ax) for ax in axes] - self.blocks = blocks + self.blocks = tuple(blocks) - ndim = self.ndim for block in blocks: - if not block.is_sparse and ndim != block.ndim: - raise AssertionError(('Number of 
Block dimensions (%d) must ' - 'equal number of axes (%d)') - % (block.ndim, ndim)) + if block.is_sparse: + if len(block.mgr_locs) != 1: + raise AssertionError("Sparse block refers to multiple items") + else: + if self.ndim != block.ndim: + raise AssertionError(('Number of Block dimensions (%d) must ' + 'equal number of axes (%d)') + % (block.ndim, self.ndim)) if do_integrity_check: self._verify_integrity() - self._has_sparse = False self._consolidate_check() - # we have a duplicate items index, setup the block maps - if not self.items.is_unique: - self._set_ref_locs(do_refs=True) + self._rebuild_blknos_and_blklocs() def make_empty(self, axes=None): """ return an empty BlockManager with the items axis of len 0 """ @@ -2136,182 +1939,77 @@ def __nonzero__(self): @property def shape(self): - if getattr(self, '_shape', None) is None: - self._shape = tuple(len(ax) for ax in self.axes) - return self._shape + return tuple(len(ax) for ax in self.axes) @property def ndim(self): - if getattr(self, '_ndim', None) is None: - self._ndim = len(self.axes) - return self._ndim + return len(self.axes) - def _set_axis(self, axis, value, check_axis=True): - cur_axis = self.axes[axis] - value = _ensure_index(value) + def set_axis(self, axis, new_labels): + new_labels = _ensure_index(new_labels) + old_len = len(self.axes[axis]) + new_len = len(new_labels) - if check_axis and len(value) != len(cur_axis): + if new_len != old_len: raise ValueError('Length mismatch: Expected axis has %d elements, ' - 'new values have %d elements' % (len(cur_axis), - len(value))) - - self.axes[axis] = value - self._shape = None - return cur_axis, value - - def set_axis(self, axis, value, maybe_rename=True, check_axis=True): - cur_axis, value = self._set_axis(axis, value, check_axis) - - if axis == 0: - - # set/reset ref_locs based on the current index - # and map the new index if needed - self._set_ref_locs(labels=cur_axis) - - # take via ref_locs - for block in self.blocks: - block.set_ref_items(self.items, maybe_rename=maybe_rename) - - # set/reset ref_locs based on the new index - self._set_ref_locs(labels=value, do_refs=True) - - def _reset_ref_locs(self): - """ take the current _ref_locs and reset ref_locs on the blocks - to correctly map, ignoring Nones; - reset both _items_map and _ref_locs """ - - # let's reset the ref_locs in individual blocks - if self.items.is_unique: - for b in self.blocks: - b._ref_locs = None - else: - for b in self.blocks: - b.reset_ref_locs() - self._rebuild_ref_locs() + 'new values have %d elements' % (old_len, new_len)) - self._ref_locs = None - self._items_map = None + self.axes[axis] = new_labels - def _rebuild_ref_locs(self): - """Take _ref_locs and set the individual block ref_locs, skipping Nones - no effect on a unique index + def rename_axis(self, mapper, axis, copy=True): """ - if getattr(self, '_ref_locs', None) is not None: - item_count = 0 - for v in self._ref_locs: - if v is not None: - block, item_loc = v - if block._ref_locs is None: - block.reset_ref_locs() - block._ref_locs[item_loc] = item_count - item_count += 1 - - def _set_ref_locs(self, labels=None, do_refs=False): - """ - if we have a non-unique index on this axis, set the indexers - we need to set an absolute indexer for the blocks - return the indexer if we are not unique + Rename one of axes. 
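
A note on the two methods above: `set_axis` now always enforces a length match (the old `check_axis=False` escape hatch is gone, which is presumably why `_ensure_valid_index` in the frame.py hunk earlier switched to `reindex_axis`), and `rename_axis` routes through `set_axis` with a same-length transformed index, so it can never trip that check. A rough restatement of the contract in toy code (not the pandas classes):

    def set_axis(axes, axis, new_labels):
        # strict: replacing labels never changes an axis' length
        if len(new_labels) != len(axes[axis]):
            raise ValueError('Length mismatch: Expected axis has %d elements, '
                             'new values have %d elements'
                             % (len(axes[axis]), len(new_labels)))
        axes[axis] = list(new_labels)

    axes = [['a', 'b'], [0, 1, 2]]
    set_axis(axes, 0, ['a_x', 'b_x'])    # rename-style: same length, fine
    try:
        set_axis(axes, 0, ['only_one'])  # shorter: now always an error
    except ValueError:
        pass
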
- labels : the (new) labels for this manager - ref : boolean, whether to set the labels (one a 1-1 mapping) + Parameters + ---------- + mapper : unary callable + axis : int + copy : boolean, default True """ + obj = self.copy(deep=copy) + obj.set_axis(axis, _transform_index(self.axes[axis], mapper)) + return obj - if labels is None: - labels = self.items - - # we are unique, and coming from a unique - is_unique = labels.is_unique - if is_unique and not do_refs: - - if not self.items.is_unique: - - # reset our ref locs - self._ref_locs = None - for b in self.blocks: - b._ref_locs = None + def add_prefix(self, prefix): + f = (str(prefix) + '%s').__mod__ + return self.rename_axis(f, axis=0) - return None + def add_suffix(self, suffix): + f = ('%s' + str(suffix)).__mod__ + return self.rename_axis(f, axis=0) - # we are going to a non-unique index - # we have ref_locs on the block at this point - if (not is_unique and do_refs) or do_refs == 'force': + @property + def _is_single_block(self): + if self.ndim == 1: + return True - # create the items map - im = getattr(self, '_items_map', None) - if im is None: + if len(self.blocks) != 1: + return False - im = dict() - for block in self.blocks: + blk = self.blocks[0] + return (blk.mgr_locs.is_slice_like and + blk.mgr_locs.as_slice == slice(0, len(self), 1)) - # if we have a duplicate index but - # _ref_locs have not been set - try: - rl = block.ref_locs - except: - raise AssertionError( - 'Cannot create BlockManager._ref_locs because ' - 'block [%s] with duplicate items [%s] does not ' - 'have _ref_locs set' % (block, labels)) - - m = maybe_create_block_in_items_map(im, block) - for i, item in enumerate(block.items): - m[i] = rl[i] - - self._items_map = im - - # create the _ref_loc map here - rl = [None] * len(labels) - for block, items in im.items(): - for i, loc in enumerate(items): - rl[loc] = (block, i) - self._ref_locs = rl - return rl - - elif do_refs: - self._reset_ref_locs() - - # return our cached _ref_locs (or will compute again - # when we recreate the block manager if needed - return getattr(self, '_ref_locs', None) - - def get_items_map(self, use_cached=True): + def _rebuild_blknos_and_blklocs(self): """ - return an inverted ref_loc map for an item index - block -> item (in that block) location -> column location - - use_cached : boolean, use the cached items map, or recreate + Update mgr._blknos / mgr._blklocs. 
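
These two arrays are the replacement for the old `_ref_locs`/`_items_map` machinery: for manager column `i`, `blknos[i]` identifies the block that holds it and `blklocs[i]` the row within that block. A worked toy example of what the loop below computes (plain numpy, illustrative only):

    import numpy as np

    # block 0 owns manager columns [0, 2]; block 1 owns [1]
    block_locs = [np.array([0, 2]), np.array([1])]

    blknos = np.empty(3, dtype=np.int64)
    blklocs = np.empty(3, dtype=np.int64)
    for blkno, locs in enumerate(block_locs):
        blknos[locs] = blkno                  # column -> owning block
        blklocs[locs] = np.arange(len(locs))  # column -> row within block

    assert list(blknos) == [0, 1, 0]
    assert list(blklocs) == [0, 0, 1]
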
""" + new_blknos = np.empty(self.shape[0], dtype=np.int64) + new_blklocs = np.empty(self.shape[0], dtype=np.int64) + new_blknos.fill(-1) + new_blklocs.fill(-1) - # cache check - if use_cached: - im = getattr(self, '_items_map', None) - if im is not None: - return im + for blkno, blk in enumerate(self.blocks): + rl = blk.mgr_locs + new_blknos[rl.indexer] = blkno + new_blklocs[rl.indexer] = np.arange(len(rl)) - im = dict() - rl = self._set_ref_locs() + if (new_blknos == -1).any(): + raise AssertionError("Gaps in blk ref_locs") - # we have a non-duplicative index - if rl is None: - - axis = self.axes[0] - for block in self.blocks: - - m = maybe_create_block_in_items_map(im, block) - for i, item in enumerate(block.items): - m[i] = axis.get_loc(item) - - # use the ref_locs to construct the map - else: - - for i, (block, idx) in enumerate(rl): - - m = maybe_create_block_in_items_map(im, block) - m[idx] = i - - self._items_map = im - return im + self._blknos = new_blknos + self._blklocs = new_blklocs # make items read only for now def _get_items(self): @@ -2327,23 +2025,6 @@ def _get_counts(self, f): counts[v] = counts.get(v, 0) + b.shape[0] return counts - def _get_types(self, f): - """ return a list of the f per item """ - self._consolidate_inplace() - - # unique - if self.items.is_unique: - l = [ None ] * len(self.items) - for b in self.blocks: - v = f(b) - for rl in b.ref_locs: - l[rl] = v - return l - - # non-unique - ref_locs = self._set_ref_locs() - return [ f(ref_locs[i][0]) for i, item in enumerate(self.items) ] - def get_dtype_counts(self): return self._get_counts(lambda b: b.dtype.name) @@ -2351,14 +2032,16 @@ def get_ftype_counts(self): return self._get_counts(lambda b: b.ftype) def get_dtypes(self): - return self._get_types(lambda b: b.dtype) + dtypes = np.array([blk.dtype for blk in self.blocks]) + return dtypes.take(self._blknos) def get_ftypes(self): - return self._get_types(lambda b: b.ftype) + ftypes = np.array([blk.ftype for blk in self.blocks]) + return ftypes.take(self._blknos) def __getstate__(self): block_values = [b.values for b in self.blocks] - block_items = [b.items for b in self.blocks] + block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks] axes_array = [ax for ax in self.axes] return axes_array, block_values, block_items @@ -2376,16 +2059,17 @@ def __setstate__(self, state): if values.dtype == 'M8[us]': values = values.astype('M8[ns]') - blk = make_block(values, items, self.axes[0]) + blk = make_block(values, + placement=self.axes[0].get_indexer(items)) blocks.append(blk) - self.blocks = blocks + self.blocks = tuple(blocks) self._post_setstate() def _post_setstate(self): self._is_consolidated = False self._known_consolidated = False - self._set_has_sparse() + self._rebuild_blknos_and_blklocs() def __len__(self): return len(self.items) @@ -2394,24 +2078,20 @@ def __unicode__(self): output = com.pprint_thing(self.__class__.__name__) for i, ax in enumerate(self.axes): if i == 0: - output += '\nItems: %s' % ax + output += u('\nItems: %s') % ax else: - output += '\nAxis %d: %s' % (i, ax) + output += u('\nAxis %d: %s') % (i, ax) for block in self.blocks: - output += '\n%s' % com.pprint_thing(block) + output += u('\n%s') % com.pprint_thing(block) return output def _verify_integrity(self): mgr_shape = self.shape - tot_items = sum(len(x.items) for x in self.blocks) + tot_items = sum(len(x.mgr_locs) for x in self.blocks) for block in self.blocks: - if block.ref_items is not self.items: - raise AssertionError("Block ref_items must be BlockManager " - "items") - if 
not block.is_sparse and block.values.shape[1:] != mgr_shape[1:]: - construction_error( - tot_items, block.values.shape[1:], self.axes) + if not block.is_sparse and block.shape[1:] != mgr_shape[1:]: + construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError('Number of manager items must equal union of ' 'block items\n# manager items: {0}, # ' @@ -2437,18 +2117,57 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False, **kwargs): """ result_blocks = [] - for blk in self.blocks: + + # filter kwarg is used in replace-* family of methods + if filter is not None: + filter_locs = set(self.items.get_indexer_for(filter)) + if len(filter_locs) == len(self.items): + # All items are included, as if there were no filtering + filter = None + else: + kwargs['filter'] = filter_locs + + if f == 'where' and kwargs.get('align', True): + align_copy = True + align_keys = ['other', 'cond'] + elif f == 'putmask' and kwargs.get('align', True): + align_copy = False + align_keys = ['new', 'mask'] + elif f == 'eval': + align_copy = False + align_keys = ['other'] + elif f == 'fillna': + # fillna internally does putmask, maybe it's better to do this + # at mgr, not block level? + align_copy = False + align_keys = ['value'] + else: + align_keys = [] + + aligned_args = dict((k, kwargs[k]) for k in align_keys + if hasattr(kwargs[k], 'reindex_axis')) + + for b in self.blocks: if filter is not None: - kwargs['filter'] = set(filter) - if not blk.items.isin(filter).any(): - result_blocks.append(blk) + if not b.mgr_locs.isin(filter_locs).any(): + result_blocks.append(b) continue - applied = getattr(blk, f)(**kwargs) + + if aligned_args: + b_items = self.items[b.mgr_locs.indexer] + + for k, obj in aligned_args.items(): + axis = getattr(obj, '_info_axis_number', 0) + kwargs[k] = obj.reindex_axis(b_items, axis=axis, + copy=align_copy) + + applied = getattr(b, f)(**kwargs) if isinstance(applied, list): result_blocks.extend(applied) else: result_blocks.append(applied) + if len(result_blocks) == 0: return self.make_empty(axes or self.axes) bm = self.__class__(result_blocks, axes or self.axes, @@ -2527,7 +2246,7 @@ def comp(s): else: # get our mask for this element, sized to this # particular block - m = masks[i][b.ref_locs] + m = masks[i][b.mgr_locs.indexer] if m.any(): new_rb.extend(b.putmask(m, d, inplace=True)) else: @@ -2539,31 +2258,6 @@ def comp(s): bm._consolidate_inplace() return bm - def prepare_for_merge(self, **kwargs): - """ prepare for merging, return a new block manager with - Sparse -> Dense - """ - self._consolidate_inplace() - if self._has_sparse: - return self.apply('prepare_for_merge', **kwargs) - return self - - def post_merge(self, objs, **kwargs): - """ try to sparsify items that were previously sparse """ - is_sparse = defaultdict(list) - for o in objs: - for blk in o._data.blocks: - if blk.is_sparse: - - # record the dtype of each item - for i in blk.items: - is_sparse[i].append(blk.dtype) - - if len(is_sparse): - return self.apply('post_merge', items=is_sparse) - - return self - def is_consolidated(self): """ Return True if more than one block with the same dtype @@ -2576,10 +2270,6 @@ def _consolidate_check(self): ftypes = [blk.ftype for blk in self.blocks] self._is_consolidated = len(ftypes) == len(set(ftypes)) self._known_consolidated = True - self._set_has_sparse() - - def _set_has_sparse(self): - self._has_sparse = any((blk.is_sparse for blk in self.blocks)) @property def is_mixed_type(self): @@ -2599,163 +2289,66 @@ def 
is_datelike_mixed_type(self): self._consolidate_inplace() return any([block.is_datelike for block in self.blocks]) - def get_block_map(self, copy=False, typ=None, columns=None, - is_numeric=False, is_bool=False): - """ return a dictionary mapping the ftype -> block list - - Parameters - ---------- - typ : return a list/dict - copy : copy if indicated - columns : a column filter list - filter if the type is indicated """ - - # short circuit - mainly for merging - if (typ == 'dict' and columns is None and not is_numeric and - not is_bool and not copy): - bm = defaultdict(list) - for b in self.blocks: - bm[str(b.ftype)].append(b) - return bm - + def get_bool_data(self, copy=False): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + """ self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_bool], copy) - if is_numeric: - filter_blocks = lambda block: block.is_numeric - elif is_bool: - filter_blocks = lambda block: block.is_bool - else: - filter_blocks = lambda block: True - - def filter_columns(b): - if columns: - if not columns in b.items: - return None - b = b.reindex_items_from(columns) - return b - - maybe_copy = lambda b: b.copy() if copy else b - - def maybe_copy(b): - if copy: - b = b.copy() - return b - - if typ == 'list': - bm = [] - for b in self.blocks: - if filter_blocks(b): - b = filter_columns(b) - if b is not None: - bm.append(maybe_copy(b)) - - else: - if typ == 'dtype': - key = lambda b: b.dtype - else: - key = lambda b: b.ftype - bm = defaultdict(list) - for b in self.blocks: - if filter_blocks(b): - b = filter_columns(b) - if b is not None: - bm[str(key(b))].append(maybe_copy(b)) - return bm - - def get_bool_data(self, **kwargs): - kwargs['is_bool'] = True - return self.get_data(**kwargs) - - def get_numeric_data(self, **kwargs): - kwargs['is_numeric'] = True - return self.get_data(**kwargs) - - def get_data(self, copy=False, columns=None, **kwargs): + def get_numeric_data(self, copy=False): """ Parameters ---------- copy : boolean, default False Whether to copy the blocks """ - blocks = self.get_block_map( - typ='list', copy=copy, columns=columns, **kwargs) - if len(blocks) == 0: - return self.make_empty() - - return self.combine(blocks, copy=copy) + self._consolidate_inplace() + return self.combine([b for b in self.blocks if b.is_numeric], copy) def combine(self, blocks, copy=True): """ return a new manager with the blocks """ - indexer = np.sort(np.concatenate([b.ref_locs for b in blocks])) + if len(blocks) == 0: + return self.make_empty() + + # FIXME: optimization potential + indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) + inv_indexer = _invert_reordering(indexer) new_items = self.items.take(indexer) new_blocks = [] for b in blocks: - b = b.reindex_items_from(new_items, copy=copy) - new_blocks.extend(_valid_blocks(b)) + b = b.copy(deep=copy) + b.mgr_locs = inv_indexer.take(b.mgr_locs.as_array) + new_blocks.append(b) + new_axes = list(self.axes) new_axes[0] = new_items return self.__class__(new_blocks, new_axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): - new_axes = list(self.axes) - - new_axes[axis] = new_axes[axis][slobj] + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") if axis == 0: - new_items = new_axes[0] - - # we want to preserver the view of a single-block - if len(self.blocks) == 1: - - blk = self.blocks[0] - ref_locs = blk.take_ref_locs(slobj) - newb = make_block(blk._slice(slobj), new_items, new_items, - 
klass=blk.__class__, fastpath=True, - placement=ref_locs) - - new_blocks = [newb] - else: - return self.reindex_items( - new_items, indexer=np.arange(len(self.items))[slobj]) + new_blocks = self._slice_take_blocks_ax0(slobj) else: - new_blocks = self._slice_blocks(slobj, axis) + slicer = [slice(None)] * (axis + 1) + slicer[axis] = slobj + slicer = tuple(slicer) + new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] - bm = self.__class__(new_blocks, new_axes, do_integrity_check=False) + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] + + bm = self.__class__(new_blocks, new_axes, do_integrity_check=False, + fastpath=True) bm._consolidate_inplace() return bm - def _slice_blocks(self, slobj, axis): - """ - slice the blocks using the provided slice object - this is only for slicing on axis != 0 - """ - - if axis == 0: - raise AssertionError("cannot _slice_blocks on axis=0") - - slicer = [slice(None, None) for _ in range(self.ndim)] - slicer[axis] = slobj - slicer = tuple(slicer) - is_unique = self.axes[0].is_unique - - def place(block): - if not is_unique: - return block._ref_locs - return None - - return [ make_block(block._slice(slicer), - block.items, - block.ref_items, - klass=block.__class__, - fastpath=True, - placement=place(block) - ) for block in self.blocks ] - - def get_series_dict(self): - # For DataFrame - return _blocks_to_series_dict(self.blocks, self.axes[1]) - def __contains__(self, item): return item in self.items @@ -2781,55 +2374,49 @@ def copy(self, deep=True): else: new_axes = list(self.axes) return self.apply('copy', axes=new_axes, deep=deep, - ref_items=new_axes[0], do_integrity_check=False) + do_integrity_check=False) def as_matrix(self, items=None): if len(self.blocks) == 0: - mat = np.empty(self.shape, dtype=float) - elif len(self.blocks) == 1: - blk = self.blocks[0] - if items is None or blk.items.equals(items): - # if not, then just call interleave per below - mat = blk.get_values() - else: - mat = self.reindex_items(items).as_matrix() + return np.empty(self.shape, dtype=float) + + if items is not None: + mgr = self.reindex_axis(items, axis=0) else: - if items is None: - mat = self._interleave(self.items) - else: - mat = self.reindex_items(items).as_matrix() + mgr = self - return mat + if self._is_single_block: + return mgr.blocks[0].get_values() + else: + return mgr._interleave() - def _interleave(self, items): + def _interleave(self): """ Return ndarray from blocks with specified item order Items must be contained in the blocks """ dtype = _interleaved_dtype(self.blocks) - items = _ensure_index(items) result = np.empty(self.shape, dtype=dtype) - itemmask = np.zeros(len(items), dtype=bool) - # By construction, all of the item should be covered by one of the - # blocks - if items.is_unique: + if result.shape[0] == 0: + # Workaround for numpy 1.7 bug: + # + # >>> a = np.empty((0,10)) + # >>> a[slice(0,0)] + # array([], shape=(0, 10), dtype=float64) + # >>> a[[]] + # Traceback (most recent call last): + # File "", line 1, in + # IndexError: index 0 is out of bounds for axis 0 with size 0 + return result - for block in self.blocks: - indexer = items.get_indexer(block.items) - if (indexer == -1).any(): - raise AssertionError('Items must contain all block items') - result[indexer] = block.get_values(dtype) - itemmask[indexer] = 1 + itemmask = np.zeros(self.shape[0]) - else: - - # non-unique, must use ref_locs - rl = self._set_ref_locs() - for i, (block, idx) in enumerate(rl): - result[i] = block.get_values(dtype)[idx] - itemmask[i] = 1 + 
for blk in self.blocks: + rl = blk.mgr_locs + result[rl.indexer] = blk.get_values(dtype) + itemmask[rl.indexer] = 1 if not itemmask.all(): raise AssertionError('Some items were not contained in blocks') @@ -2863,22 +2450,17 @@ def xs(self, key, axis=1, copy=True, takeable=False): if len(self.blocks) > 1: # we must copy here as we are mixed type for blk in self.blocks: - newb = make_block(blk.values[slicer], - blk.items, - blk.ref_items, - klass=blk.__class__, - fastpath=True) + newb = make_block(values=blk.values[slicer], + klass=blk.__class__, fastpath=True, + placement=blk.mgr_locs) new_blocks.append(newb) elif len(self.blocks) == 1: block = self.blocks[0] vals = block.values[slicer] if copy: vals = vals.copy() - new_blocks = [make_block(vals, - self.items, - self.items, - klass=block.__class__, - fastpath=True)] + new_blocks = [make_block(values=vals, placement=block.mgr_locs, + klass=block.__class__, fastpath=True,)] return self.__class__(new_blocks, new_axes) @@ -2897,7 +2479,7 @@ def fast_xs(self, loc): # non-unique (GH4726) if not items.is_unique: - result = self._interleave(items) + result = self._interleave() if self.ndim == 2: result = result.T return result[loc] @@ -2907,9 +2489,10 @@ def fast_xs(self, loc): n = len(items) result = np.empty(n, dtype=dtype) for blk in self.blocks: - for j, item in enumerate(blk.items): - i = items.get_loc(item) - result[i] = blk._try_coerce_result(blk.iget((j, loc))) + # Such assignment may incorrectly coerce NaT to None + # result[blk.mgr_locs] = blk._slice((slice(None), loc)) + for i, rl in enumerate(blk.mgr_locs): + result[rl] = blk._try_coerce_result(blk.iget((i, loc))) return result @@ -2930,112 +2513,92 @@ def consolidate(self): def _consolidate_inplace(self): if not self.is_consolidated(): - self.blocks = _consolidate(self.blocks, self.items) - - # reset our mappings - if not self.items.is_unique: - self._ref_locs = None - self._items_map = None - self._set_ref_locs(do_refs=True) + self.blocks = tuple(_consolidate(self.blocks)) self._is_consolidated = True self._known_consolidated = True - self._set_has_sparse() + self._rebuild_blknos_and_blklocs() def get(self, item): + """ + Return values for selected item (ndarray or BlockManager). 
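For orientation: the iget/fast_xs paths here lean on the _blknos/_blklocs tables rebuilt by _rebuild_blknos_and_blklocs -- for item position i, _blknos[i] names the owning block and _blklocs[i] the row inside it. A minimal sketch of that invariant, with plain integer arrays standing in for real blocks and BlockPlacement:

    import numpy as np

    # hypothetical manager: 5 items spread over two blocks
    mgr_locs_per_block = [np.array([0, 2, 4]), np.array([1, 3])]

    blknos = np.empty(5, dtype=np.int64)
    blklocs = np.empty(5, dtype=np.int64)
    blknos.fill(-1)
    blklocs.fill(-1)

    for blkno, locs in enumerate(mgr_locs_per_block):
        blknos[locs] = blkno                  # which block holds item i
        blklocs[locs] = np.arange(len(locs))  # its row within that block

    assert not (blknos == -1).any()           # the "Gaps in blk ref_locs" check
    print(blknos.tolist())                    # [0, 1, 0, 1, 0]
    print(blklocs.tolist())                   # [0, 0, 1, 1, 2]
    # iget(i) is then blocks[blknos[i]].iget(blklocs[i])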
+ """ if self.items.is_unique: - if isnull(item): + if not isnull(item): + loc = self.items.get_loc(item) + else: indexer = np.arange(len(self.items))[isnull(self.items)] - return self.get_for_nan_indexer(indexer) - _, block = self._find_block(item) - return block.get(item) + # allow a single nan location indexer + if not np.isscalar(indexer): + if len(indexer) == 1: + loc = indexer.item() + else: + raise ValueError("cannot label index with a null key") + + return self.iget(loc) else: if isnull(item): raise ValueError("cannot label index with a null key") - indexer = self.items.get_loc(item) - ref_locs = np.array(self._set_ref_locs()) - - # duplicate index but only a single result - if com.is_integer(indexer): - - b, loc = ref_locs[indexer] - values = [b.iget(loc)] - index = Index([self.items[indexer]]) - - # we have a multiple result, potentially across blocks - else: - - values = [block.iget(i) for block, i in ref_locs[indexer]] - index = self.items[indexer] - - # create and return a new block manager - axes = [index] + self.axes[1:] - blocks = form_blocks(values, index, axes) - mgr = BlockManager(blocks, axes) - mgr._consolidate_inplace() - return mgr + indexer = self.items.get_indexer_for([item]) + return self.reindex_indexer(new_axis=self.items[indexer], + indexer=indexer, axis=0, allow_dups=True) def iget(self, i): - item = self.items[i] - - # unique - if self.items.is_unique: - if notnull(item): - return self.get(item) - return self.get_for_nan_indexer(i) - - ref_locs = self._set_ref_locs() - b, loc = ref_locs[i] - return b.iget(loc) - - def get_for_nan_indexer(self, indexer): - - # allow a single nan location indexer - if not np.isscalar(indexer): - if len(indexer) == 1: - indexer = indexer.item() - else: - raise ValueError("cannot label index with a null key") - - # take a nan indexer and return the values - ref_locs = self._set_ref_locs(do_refs='force') - b, loc = ref_locs[indexer] - return b.iget(loc) + return self.blocks[self._blknos[i]].iget(self._blklocs[i]) def get_scalar(self, tup): """ Retrieve single item """ - item = tup[0] - _, blk = self._find_block(item) + full_loc = list(ax.get_loc(x) + for ax, x in zip(self.axes, tup)) + blk = self.blocks[self._blknos[full_loc[0]]] + full_loc[0] = self._blklocs[full_loc[0]] - # this could obviously be seriously sped up in cython - item_loc = blk.items.get_loc(item), - full_loc = item_loc + tuple(ax.get_loc(x) - for ax, x in zip(self.axes[1:], tup[1:])) - return blk.values[full_loc] + # FIXME: this may return non-upcasted types? + return blk.values[tuple(full_loc)] def delete(self, item): + """ + Delete selected item (items if non-unique) in-place. 
+ """ + indexer = self.items.get_loc(item) - is_unique = self.items.is_unique - loc = self.items.get_loc(item) - - # dupe keys may return mask - loc = _possibly_convert_to_indexer(loc) - self._delete_from_all_blocks(loc, item) + is_deleted = np.zeros(self.shape[0], dtype=np.bool_) + is_deleted[indexer] = True + ref_loc_offset = -is_deleted.cumsum() - # _ref_locs, and _items_map are good here - new_items = self.items.delete(loc) - self.set_items_norename(new_items) + is_blk_deleted = [False] * len(self.blocks) - self._known_consolidated = False - - if not is_unique: - self._consolidate_inplace() + if isinstance(indexer, int): + affected_start = indexer + else: + affected_start = is_deleted.nonzero()[0][0] + + for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]): + blk = self.blocks[blkno] + bml = blk.mgr_locs + blk_del = is_deleted[bml.indexer].nonzero()[0] + + if len(blk_del) == len(bml): + is_blk_deleted[blkno] = True + continue + elif len(blk_del) != 0: + blk.delete(blk_del) + bml = blk.mgr_locs + + blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer]) + + # FIXME: use Index.delete as soon as it uses fastpath=True + self.axes[0] = self.items[~is_deleted] + self.blocks = tuple(b for blkno, b in enumerate(self.blocks) + if not is_blk_deleted[blkno]) + self._shape = None + self._rebuild_blknos_and_blklocs() def set(self, item, value, check=False): """ @@ -3043,508 +2606,345 @@ def set(self, item, value, check=False): contained in the current set of items if check, then validate that we are not setting the same data in-place """ - if not isinstance(value, SparseArray): + # FIXME: refactor, clearly separate broadcasting & zip-like assignment + value_is_sparse = isinstance(value, SparseArray) + + if value_is_sparse: + assert self.ndim == 2 + + def value_getitem(placement): + return value + else: if value.ndim == self.ndim - 1: value = value.reshape((1,) + value.shape) + + def value_getitem(placement): + return value + else: + def value_getitem(placement): + return value[placement.indexer] if value.shape[1:] != self.shape[1:]: raise AssertionError('Shape of new values must be compatible ' 'with manager shape') - def _set_item(item, arr): - i, block = self._find_block(item) - if not block.should_store(value): - # delete from block, create and append new block - self._delete_from_block(i, item) - self._add_new_block(item, arr, loc=None) - else: - block.set(item, arr, check=check) - try: - loc = self.items.get_loc(item) - if isinstance(loc, int): - _set_item(self.items[loc], value) + except KeyError: + # This item wasn't present, just insert at end + self.insert(len(self.items), item, value) + return + + if isinstance(loc, int): + loc = [loc] + + blknos = self._blknos[loc] + blklocs = self._blklocs[loc] + + unfit_mgr_locs = [] + unfit_val_locs = [] + removed_blknos = [] + for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + blk = self.blocks[blkno] + blk_locs = blklocs[val_locs.indexer] + if blk.should_store(value): + blk.set(blk_locs, value_getitem(val_locs), check=check) else: - subset = self.items[loc] - if len(value) != len(subset): - raise AssertionError( - 'Number of items to set did not match') + unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) + unfit_val_locs.append(val_locs) - # we are inserting multiple non-unique items as replacements - # we are inserting one by one, so the index can go from unique - # to non-unique during the loop, need to have _ref_locs defined - # at all times - if np.isscalar(item) and (com.is_list_like(loc) or 
isinstance(loc, slice)): + # If all block items are unfit, schedule the block for removal. + if len(val_locs) == len(blk.mgr_locs): + removed_blknos.append(blkno) + else: + self._blklocs[blk.mgr_locs.indexer] = -1 + blk.delete(blk_locs) + self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk)) + + if len(removed_blknos): + # Remove blocks & update blknos accordingly + is_deleted = np.zeros(self.nblocks, dtype=np.bool_) + is_deleted[removed_blknos] = True + + new_blknos = np.empty(self.nblocks, dtype=np.int_) + new_blknos.fill(-1) + new_blknos[~is_deleted] = np.arange(self.nblocks - + len(removed_blknos)) + self._blknos = new_blknos.take(self._blknos, axis=0) + self.blocks = tuple(blk for i, blk in enumerate(self.blocks) + if i not in set(removed_blknos)) + + if unfit_val_locs: + unfit_mgr_locs = np.concatenate(unfit_mgr_locs) + unfit_count = len(unfit_mgr_locs) - # first delete from all blocks - self.delete(item) + new_blocks = [] + if value_is_sparse: + # This code (ab-)uses the fact that sparse blocks contain only + # one item. + new_blocks.extend( + make_block(values=value.copy(), ndim=self.ndim, + placement=slice(mgr_loc, mgr_loc + 1)) + for mgr_loc in unfit_mgr_locs) + + self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) + + len(self.blocks)) + self._blklocs[unfit_mgr_locs] = 0 - loc = _possibly_convert_to_indexer(loc) - for i, (l, k, arr) in enumerate(zip(loc, subset, value)): + else: + # unfit_val_locs contains BlockPlacement objects + unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:]) - # insert the item - self.insert( - l, k, arr[None, :], allow_duplicates=True) + new_blocks.append( + make_block(values=value_getitem(unfit_val_items), + ndim=self.ndim, placement=unfit_mgr_locs)) - # reset the _ref_locs on indiviual blocks - # rebuild ref_locs - if self.items.is_unique: - self._reset_ref_locs() - self._set_ref_locs(do_refs='force') + self._blknos[unfit_mgr_locs] = len(self.blocks) + self._blklocs[unfit_mgr_locs] = np.arange(unfit_count) - self._rebuild_ref_locs() + self.blocks += tuple(new_blocks) - else: - for i, (item, arr) in enumerate(zip(subset, value)): - _set_item(item, arr[None, :]) - except KeyError: - # insert at end - self.insert(len(self.items), item, value) - - self._known_consolidated = False + # Newly created block's dtype may already be present. + self._known_consolidated = False def insert(self, loc, item, value, allow_duplicates=False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + """ if not allow_duplicates and item in self.items: # Should this be a different kind of error?? 
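When every item of a block turns out to be unfit, set() drops the whole block and must renumber _blknos so it still indexes the compacted blocks tuple. A rough sketch of that renumbering step, with hypothetical block counts:

    import numpy as np

    nblocks = 4
    removed_blknos = [1, 3]                   # blocks scheduled for removal

    is_deleted = np.zeros(nblocks, dtype=np.bool_)
    is_deleted[removed_blknos] = True

    new_blknos = np.empty(nblocks, dtype=np.int_)
    new_blknos.fill(-1)
    new_blknos[~is_deleted] = np.arange(nblocks - len(removed_blknos))

    blknos = np.array([0, 0, 2, 1, 3])        # per-item block ids, pre-removal
    print(new_blknos.take(blknos).tolist())   # [0, 0, 1, -1, -1]
    # the -1 entries are exactly the unfit items, which are reassigned to
    # the freshly appended blocks right afterwards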
raise ValueError('cannot insert %s, already exists' % item) - try: - new_items = self.items.insert(loc, item) - self.set_items_norename(new_items) + if not isinstance(loc, int): + raise TypeError("loc must be int") - # new block - self._add_new_block(item, value, loc=loc) + block = make_block(values=value, + ndim=self.ndim, + placement=slice(loc, loc+1)) - except: - - # so our insertion operation failed, so back out of the new items - # GH 3010 - new_items = self.items.delete(loc) - self.set_items_norename(new_items) - - # re-raise - raise - - if len(self.blocks) > 100: - self._consolidate_inplace() - - self._known_consolidated = False + for blkno, count in _fast_count_smallints(self._blknos[loc:]): + blk = self.blocks[blkno] + if count == len(blk.mgr_locs): + blk.mgr_locs = blk.mgr_locs.add(1) + else: + new_mgr_locs = blk.mgr_locs.as_array.copy() + new_mgr_locs[new_mgr_locs >= loc] += 1 + blk.mgr_locs = new_mgr_locs + + if loc == self._blklocs.shape[0]: + # np.append is a lot faster (at least in numpy 1.7.1), let's use it + # if we can. + self._blklocs = np.append(self._blklocs, 0) + self._blknos = np.append(self._blknos, len(self.blocks)) + else: + self._blklocs = np.insert(self._blklocs, loc, 0) + self._blknos = np.insert(self._blknos, loc, len(self.blocks)) - # clear the internal ref_loc mappings if necessary - if loc != len(self.items) - 1 and new_items.is_unique: - self.set_items_clear(new_items) + self.axes[0] = self.items.insert(loc, item) - def set_items_norename(self, value): - self.set_axis(0, value, maybe_rename=False, check_axis=False) + self.blocks += (block,) self._shape = None - def set_items_clear(self, value): - """ clear the ref_locs on all blocks """ - self.set_axis(0, value, maybe_rename='clear', check_axis=False) - - def _delete_from_all_blocks(self, loc, item): - """ delete from the items loc the item - the item could be in multiple blocks which could - change each iteration (as we split blocks) """ - - # possibily convert to an indexer - loc = _possibly_convert_to_indexer(loc) - - if isinstance(loc, (list, tuple, np.ndarray)): - for l in loc: - for i, b in enumerate(self.blocks): - if item in b.items: - self._delete_from_block(i, item) + self._known_consolidated = False - else: - i, _ = self._find_block(item) - self._delete_from_block(i, item) + if len(self.blocks) > 100: + self._consolidate_inplace() - def _delete_from_block(self, i, item): + def reindex_axis(self, new_index, axis, method=None, limit=None, + fill_value=None, copy=True): """ - Delete and maybe remove the whole block - - Remap the split blocks to there old ranges, - so after this function, _ref_locs and _items_map (if used) - are correct for the items, None fills holes in _ref_locs + Conform block manager to new index. 
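insert() never rewrites block values; it only bumps the placements sitting at or past the insertion point. A minimal sketch of that shift, assuming a bare ndarray of locations in place of a BlockPlacement:

    import numpy as np

    loc = 2                                   # new item goes in at position 2
    mgr_locs = np.array([0, 3, 1, 4])         # one block's item positions

    new_mgr_locs = mgr_locs.copy()
    new_mgr_locs[new_mgr_locs >= loc] += 1    # everything at/after loc moves right
    print(new_mgr_locs.tolist())              # [0, 4, 1, 5]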
""" - block = self.blocks.pop(i) - ref_locs = self._set_ref_locs() - prev_items_map = self._items_map.pop( - block) if ref_locs is not None else None - - # if we can't consolidate, then we are removing this block in its - # entirey - if block._can_consolidate: - - # compute the split mask - loc = block.items.get_loc(item) - if type(loc) == slice or com.is_integer(loc): - mask = np.array([True] * len(block)) - mask[loc] = False - else: # already a mask, inverted - mask = -loc - - # split the block - counter = 0 - for s, e in com.split_ranges(mask): - - sblock = make_block(block.values[s:e], - block.items[s:e].copy(), - block.ref_items, - klass=block.__class__, - fastpath=True) - - self.blocks.append(sblock) - - # update the _ref_locs/_items_map - if ref_locs is not None: - - # fill the item_map out for this sub-block - m = maybe_create_block_in_items_map( - self._items_map, sblock) - for j, itm in enumerate(sblock.items): + new_index = _ensure_index(new_index) + new_index, indexer = self.axes[axis].reindex( + new_index, method=method, limit=limit, copy_if_needed=True) - # is this item masked (e.g. was deleted)? - while (True): + return self.reindex_indexer(new_index, indexer, axis=axis, + fill_value=fill_value, copy=copy) - if counter > len(mask) or mask[counter]: - break - else: - counter += 1 - - # find my mapping location - m[j] = prev_items_map[counter] - counter += 1 - - # set the ref_locs in this block - sblock.set_ref_locs(m) - - # reset the ref_locs to the new structure - if ref_locs is not None: - - # items_map is now good, with the original locations - self._set_ref_locs(do_refs=True) - - # reset the ref_locs based on the now good block._ref_locs - self._reset_ref_locs() - - def _add_new_block(self, item, value, loc=None): - # Do we care about dtype at the moment? - - # hm, elaborate hack? - if loc is None: - loc = self.items.get_loc(item) - new_block = make_block(value, self.items[loc:loc + 1].copy(), - self.items, fastpath=True) - self.blocks.append(new_block) - - # set ref_locs based on the this new block - # and add to the ref/items maps - if not self.items.is_unique: - - # insert into the ref_locs at the appropriate location - # _ref_locs is already long enough, - # but may need to shift elements - new_block.set_ref_locs([0]) - - # need to shift elements to the right - if self._ref_locs[loc] is not None: - for i in reversed(lrange(loc + 1, len(self._ref_locs))): - self._ref_locs[i] = self._ref_locs[i - 1] - - self._ref_locs[loc] = (new_block, 0) - - # and reset - self._reset_ref_locs() - self._set_ref_locs(do_refs=True) - - def _find_block(self, item): - self._check_have(item) - for i, block in enumerate(self.blocks): - if item in block: - return i, block - - def _check_have(self, item): - if item not in self.items: - raise KeyError('no item named %s' % com.pprint_thing(item)) - - def reindex_axis(self, new_axis, indexer=None, method=None, axis=0, - fill_value=None, limit=None, copy=True): - new_axis = _ensure_index(new_axis) - cur_axis = self.axes[axis] - - if new_axis.equals(cur_axis): - if copy: - result = self.copy(deep=True) - result.axes[axis] = new_axis - result._shape = None + def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, + allow_dups=False, copy=True): + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object + allow_dups : bool - if axis == 0: - # patch ref_items, #1823 - for blk in result.blocks: - blk.ref_items = new_axis + pandas-indexer with -1's only. 
+ """ - return result - else: + if indexer is None: + if new_axis is self.axes[axis] and not copy: return self - if axis == 0: - if method is not None or limit is not None: - return self.reindex_axis0_with_method( - new_axis, indexer=indexer, method=method, - fill_value=fill_value, limit=limit, copy=copy - ) - return self.reindex_items(new_axis, indexer=indexer, copy=copy, - fill_value=fill_value) - - new_axis, indexer = cur_axis.reindex( - new_axis, method, copy_if_needed=True) - return self.reindex_indexer(new_axis, indexer, axis=axis, - fill_value=fill_value) + result = self.copy(deep=copy) + result.axes = list(self.axes) + result.axes[axis] = new_axis + return result - def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, - fill_value=None, limit=None, copy=True): - raise AssertionError('method argument not supported for ' - 'axis == 0') + self._consolidate_inplace() - def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=None, - allow_dups=False): - """ - pandas-indexer with -1's only. - """ # trying to reindex on an axis with duplicates - if not allow_dups and not self.axes[axis].is_unique and len(indexer): + if (not allow_dups and not self.axes[axis].is_unique + and len(indexer)): raise ValueError("cannot reindex from a duplicate axis") - if not self.is_consolidated(): - self = self.consolidate() + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") if axis == 0: - return self._reindex_indexer_items(new_axis, indexer, fill_value) - - new_blocks = [] - for block in self.blocks: - newb = block.reindex_axis( - indexer, axis=axis, fill_value=fill_value) - new_blocks.append(newb) + new_blocks = self._slice_take_blocks_ax0( + indexer, fill_tuple=(fill_value,)) + else: + new_blocks = [blk.take_nd(indexer, axis=axis, + fill_tuple=(fill_value if fill_value is not None else + blk.fill_value,)) + for blk in self.blocks] new_axes = list(self.axes) new_axes[axis] = new_axis return self.__class__(new_blocks, new_axes) - def _reindex_indexer_items(self, new_items, indexer, fill_value): - # TODO: less efficient than I'd like - - item_order = com.take_1d(self.items.values, indexer) - new_axes = [new_items] + self.axes[1:] - new_blocks = [] - is_unique = new_items.is_unique - - # we have duplicates in the items and what we are reindexing - if not is_unique and not self.items.is_unique: - - rl = self._set_ref_locs(do_refs='force') - for i, idx in enumerate(indexer): - item = new_items.take([i]) - if idx >= 0: - blk, lidx = rl[idx] - blk = make_block(_block_shape(blk.iget(lidx)), item, - new_items, ndim=self.ndim, fastpath=True, - placement=[i]) - - # a missing value - else: - blk = self._make_na_block(item, - new_items, - placement=[i], - fill_value=fill_value) - new_blocks.append(blk) - new_blocks = _consolidate(new_blocks, new_items) - - - # keep track of what items aren't found anywhere - else: - l = np.arange(len(item_order)) - mask = np.zeros(len(item_order), dtype=bool) - - for blk in self.blocks: - blk_indexer = blk.items.get_indexer(item_order) - selector = blk_indexer != -1 - - # update with observed items - mask |= selector - - if not selector.any(): - continue - - new_block_items = new_items.take(selector.nonzero()[0]) - new_values = com.take_nd(blk.values, blk_indexer[selector], axis=0, - allow_fill=False) - placement = l[selector] if not is_unique else None - new_blocks.append(make_block(new_values, - new_block_items, - new_items, - placement=placement, - fastpath=True)) - - if not mask.all(): - na_items = new_items[-mask] - 
placement = l[-mask] if not is_unique else None - na_block = self._make_na_block(na_items, - new_items, - placement=placement, - fill_value=fill_value) - new_blocks.append(na_block) - new_blocks = _consolidate(new_blocks, new_items) + def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): + """ + Slice/take blocks along axis=0. - return self.__class__(new_blocks, new_axes) + Overloaded for SingleBlock - def reindex_items(self, new_items, indexer=None, copy=True, - fill_value=None): - """ + Returns + ------- + new_blocks : list of Block """ - new_items = _ensure_index(new_items) - data = self - if not data.is_consolidated(): - data = data.consolidate() - return data.reindex_items(new_items, copy=copy, - fill_value=fill_value) - if indexer is None: - new_items, indexer = self.items.reindex(new_items, - copy_if_needed=True) - new_axes = [new_items] + self.axes[1:] + allow_fill = fill_tuple is not None - # could have so me pathological (MultiIndex) issues here - new_blocks = [] - if indexer is None: - for blk in self.blocks: - if copy: - blk = blk.reindex_items_from(new_items) - else: - blk.ref_items = new_items - new_blocks.extend(_valid_blocks(blk)) - else: + sl_type, slobj, sllen = _preprocess_slice_or_indexer( + slice_or_indexer, self.shape[0], allow_fill=allow_fill) - # unique - if self.axes[0].is_unique and new_items.is_unique: + if self._is_single_block: + blk = self.blocks[0] - # ok to use the global indexer if only 1 block - i = indexer if len(self.blocks) == 1 else None + if sl_type in ('slice', 'mask'): + return [blk.getitem_block(slobj, + new_mgr_locs=slice(0, sllen))] + elif not allow_fill or self.ndim == 1: + if allow_fill and fill_tuple[0] is None: + _, fill_value = com._maybe_promote(blk.dtype) + fill_tuple = (fill_value,) + + return [blk.take_nd(slobj, axis=0, + new_mgr_locs=slice(0, sllen), + fill_tuple=fill_tuple)] + + if sl_type in ('slice', 'mask'): + blknos = self._blknos[slobj] + blklocs = self._blklocs[slobj] + else: + blknos = com.take_1d(self._blknos, slobj, fill_value=-1, + allow_fill=allow_fill) + blklocs = com.take_1d(self._blklocs, slobj, fill_value=-1, + allow_fill=allow_fill) + + # When filling blknos, make sure blknos is updated before appending to + # blocks list, that way new blkno is exactly len(blocks). + # + # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order, + # pytables serialization will break otherwise. + blocks = [] + for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks), + group=True): + if blkno == -1: + # If we've got here, fill_tuple was not None. + fill_value = fill_tuple[0] + + blocks.append(self._make_na_block( + placement=mgr_locs, fill_value=fill_value)) + else: + blk = self.blocks[blkno] - for block in self.blocks: - blk = block.reindex_items_from(new_items, indexer=i, copy=copy) - new_blocks.extend(_valid_blocks(blk)) + # Otherwise, slicing along items axis is necessary. + if blk.is_sparse: + # A sparse block, it's easy, because there's only one item + # and each mgr loc is a copy of that single item. 
+ for mgr_loc in mgr_locs: + newblk = blk.copy(deep=True) + newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) + blocks.append(newblk) - # non-unique - else: - rl = self._set_ref_locs(do_refs='force') - for i, idx in enumerate(indexer): - blk, lidx = rl[idx] - item = new_items.take([i]) - blk = make_block(_block_shape(blk.iget(lidx)), item, - new_items, ndim=self.ndim, fastpath=True, - placement=[i]) - new_blocks.append(blk) - - # add a na block if we are missing items - mask = indexer == -1 - if mask.any(): - extra_items = new_items[mask] - na_block = self._make_na_block(extra_items, new_items, - fill_value=fill_value) - new_blocks.append(na_block) - new_blocks = _consolidate(new_blocks, new_items) - - # consolidate - # import for non-unique which creates a block for each item - # and they must be consolidated before passing on - new_blocks = _consolidate(new_blocks, new_items) + else: + blocks.append(blk.take_nd( + blklocs[mgr_locs.indexer], axis=0, + new_mgr_locs=mgr_locs, fill_tuple=None)) - return self.__class__(new_blocks, new_axes) + return blocks - def _make_na_block(self, items, ref_items, placement=None, - fill_value=None): + def _make_na_block(self, placement, fill_value=None): # TODO: infer dtypes other than float64 from fill_value if fill_value is None: fill_value = np.nan block_shape = list(self.shape) - block_shape[0] = len(items) + block_shape[0] = len(placement) dtype, fill_value = com._infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) - return make_block(block_values, items, ref_items, placement=placement) - - def take(self, indexer, new_index=None, axis=1, verify=True): - if axis < 1: - raise AssertionError('axis must be at least 1, got %d' % axis) + return make_block(block_values, placement=placement) + def take(self, indexer, axis=1, verify=True, convert=True): + """ + Take items along any axis. 
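take() below first normalizes the indexer: with convert=True negative positions are wrapped, and the verify step rejects anything out of bounds. A sketch of that normalization, with a hypothetical helper standing in for _maybe_convert_indices:

    import numpy as np

    def convert_indices(indexer, n):
        # wrap negatives, then bounds-check -- roughly what the real
        # _maybe_convert_indices helper is relied on to do here
        indexer = np.asanyarray(indexer, dtype=np.int_)
        mask = indexer < 0
        indexer[mask] += n
        if ((indexer < 0) | (indexer >= n)).any():
            raise IndexError('indices are out-of-bounds')
        return indexer

    print(convert_indices([0, -1, 2], n=4).tolist())  # [0, 3, 2]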
+ """ self._consolidate_inplace() - if isinstance(indexer, list): - indexer = np.array(indexer) + indexer = np.asanyarray(indexer, dtype=np.int_) - indexer = com._ensure_platform_int(indexer) - n = len(self.axes[axis]) + n = self.shape[axis] + if convert: + indexer = _maybe_convert_indices(indexer, n) if verify: - indexer = _maybe_convert_indices(indexer, n) if ((indexer == -1) | (indexer >= n)).any(): raise Exception('Indices must be nonzero and less than ' 'the axis length') - new_axes = list(self.axes) - if new_index is None: - new_index = self.axes[axis].take(indexer) - - new_axes[axis] = new_index - return self.apply('take', - axes=new_axes, - indexer=indexer, - ref_items=new_axes[0], - new_axis=new_axes[axis], - axis=axis) - - def merge(self, other, lsuffix=None, rsuffix=None): + new_labels = self.axes[axis].take(indexer) + return self.reindex_indexer(new_axis=new_labels, indexer=indexer, + axis=axis, allow_dups=True) + + def merge(self, other, lsuffix='', rsuffix=''): if not self._is_indexed_like(other): raise AssertionError('Must have same axes to merge managers') - this, other = self._maybe_rename_join(other, lsuffix, rsuffix) - - cons_items = this.items + other.items - new_axes = list(this.axes) - new_axes[0] = cons_items - - consolidated = _consolidate(this.blocks + other.blocks, cons_items) - return self.__class__(consolidated, new_axes) + l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix, + right=other.items, rsuffix=rsuffix) + new_items = _concat_indexes([l, r]) - def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True): - to_rename = self.items.intersection(other.items) - if len(to_rename) > 0: - if not lsuffix and not rsuffix: - raise ValueError('columns overlap but no suffix specified: %s' - % to_rename) + new_blocks = [blk.copy(deep=False) + for blk in self.blocks] - def lrenamer(x): - if x in to_rename: - return '%s%s' % (x, lsuffix) - return x + offset = self.shape[0] + for blk in other.blocks: + blk = blk.copy(deep=False) + blk.mgr_locs = blk.mgr_locs.add(offset) + new_blocks.append(blk) - def rrenamer(x): - if x in to_rename: - return '%s%s' % (x, rsuffix) - return x - - this = self.rename_items(lrenamer, copy=copydata) - other = other.rename_items(rrenamer, copy=copydata) - else: - this = self + new_axes = list(self.axes) + new_axes[0] = new_items - return this, other + return self.__class__(_consolidate(new_blocks), new_axes) def _is_indexed_like(self, other): """ @@ -3558,83 +2958,6 @@ def _is_indexed_like(self, other): return False return True - def rename(self, mapper, axis, copy=False): - """ generic rename """ - - if axis == 0: - return self.rename_items(mapper, copy=copy) - return self.rename_axis(mapper, axis=axis) - - def rename_axis(self, mapper, axis=1): - - index = self.axes[axis] - if isinstance(index, MultiIndex): - new_axis = MultiIndex.from_tuples( - [tuple(mapper(y) for y in x) for x in index], - names=index.names) - else: - new_axis = Index([mapper(x) for x in index], name=index.name) - - if not new_axis.is_unique: - raise AssertionError('New axis must be unique to rename') - - new_axes = list(self.axes) - new_axes[axis] = new_axis - return self.__class__(self.blocks, new_axes) - - def rename_items(self, mapper, copy=True): - if isinstance(self.items, MultiIndex): - items = [tuple(mapper(y) for y in x) for x in self.items] - new_items = MultiIndex.from_tuples(items, names=self.items.names) - else: - items = [mapper(x) for x in self.items] - new_items = Index(items, name=self.items.name) - - new_blocks = [] - for block in 
self.blocks: - newb = block.copy(deep=copy) - newb.set_ref_items(new_items, maybe_rename=True) - new_blocks.append(newb) - new_axes = list(self.axes) - new_axes[0] = new_items - return self.__class__(new_blocks, new_axes) - - def add_prefix(self, prefix): - f = (('%s' % prefix) + '%s').__mod__ - return self.rename_items(f) - - def add_suffix(self, suffix): - f = ('%s' + ('%s' % suffix)).__mod__ - return self.rename_items(f) - - @property - def block_id_vector(self): - # TODO - result = np.empty(len(self.items), dtype=int) - result.fill(-1) - - for i, blk in enumerate(self.blocks): - indexer = self.items.get_indexer(blk.items) - if (indexer == -1).any(): - raise AssertionError('Block items must be in manager items') - result.put(indexer, i) - - if (result < 0).any(): - raise AssertionError('Some items were not in any block') - return result - - @property - def item_dtypes(self): - result = np.empty(len(self.items), dtype='O') - mask = np.zeros(len(self.items), dtype=bool) - for i, blk in enumerate(self.blocks): - indexer = self.items.get_indexer(blk.items) - result.put(indexer, blk.dtype.name) - mask.put(indexer, 1) - if not (mask.all()): - raise AssertionError('Some items were not in any block') - return result - def equals(self, other): self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): @@ -3646,16 +2969,16 @@ def equals(self, other): return all(block.equals(oblock) for block, oblock in zip(self.blocks, other.blocks)) -class SingleBlockManager(BlockManager): +class SingleBlockManager(BlockManager): """ manage a single block with """ + ndim = 1 _is_consolidated = True _known_consolidated = True - __slots__ = ['axes', 'blocks', '_block', - '_values', '_shape', '_has_sparse'] + __slots__ = () - def __init__(self, block, axis, do_integrity_check=False, fastpath=True): + def __init__(self, block, axis, do_integrity_check=False, fastpath=False): if isinstance(axis, list): if len(axis) != 1: @@ -3675,11 +2998,7 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): raise ValueError('Cannot create SingleBlockManager with ' 'more than 1 block') block = block[0] - if not isinstance(block, Block): - block = make_block(block, axis, axis, ndim=1, fastpath=True) - else: - self.axes = [_ensure_index(axis)] # create the block here @@ -3689,103 +3008,76 @@ def __init__(self, block, axis, do_integrity_check=False, fastpath=True): if len(block) > 1: dtype = _interleaved_dtype(block) block = [b.astype(dtype) for b in block] - block = _consolidate(block, axis) + block = _consolidate(block) if len(block) != 1: raise ValueError('Cannot create SingleBlockManager with ' 'more than 1 block') block = block[0] - if not isinstance(block, Block): - block = make_block(block, axis, axis, ndim=1, fastpath=True) + if not isinstance(block, Block): + block = make_block(block, + placement=slice(0, len(axis)), + ndim=1, fastpath=True) self.blocks = [block] - self._block = self.blocks[0] - self._values = self._block.values - self._has_sparse = self._block.is_sparse def _post_setstate(self): - self._block = self.blocks[0] - self._values = self._block.values - - def _get_counts(self, f): - return { f(self._block) : 1 } - - @property - def shape(self): - if getattr(self, '_shape', None) is None: - self._shape = tuple([len(self.axes[0])]) - return self._shape + pass - def apply(self, f, axes=None, do_integrity_check=False, **kwargs): - """ - fast path for SingleBlock Manager + @property + def _block(self): + return self.blocks[0] - ssee also BlockManager.apply - """ - applied 
= getattr(self._block, f)(**kwargs) - bm = self.__class__(applied, axes or self.axes, - do_integrity_check=do_integrity_check) - bm._consolidate_inplace() - return bm + @property + def _values(self): + return self._block.values def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return - if not copy and self.index.equals(new_axis): - return self + if self.index.equals(new_axis): + if copy: + return self.copy(deep=True) + else: + return self - block = self._block.reindex_items_from(new_axis, indexer=indexer, - method=method, - fill_value=fill_value, - limit=limit, copy=copy) - mgr = SingleBlockManager(block, new_axis) - mgr._consolidate_inplace() - return mgr + values = self._block.get_values() - def _reindex_indexer_items(self, new_items, indexer, fill_value): - # equiv to a reindex - return self.reindex(new_items, indexer=indexer, fill_value=fill_value, - copy=False) + if indexer is None: + indexer = self.items.get_indexer_for(new_axis) - def reindex_axis0_with_method(self, new_axis, indexer=None, method=None, - fill_value=None, limit=None, copy=True): - return self.reindex(new_axis, indexer=indexer, method=method, - fill_value=fill_value, limit=limit, copy=copy) + if fill_value is None: + # FIXME: is fill_value used correctly in sparse blocks? + if not self._block.is_sparse: + fill_value = self._block.fill_value + else: + fill_value = np.nan - def _delete_from_block(self, i, item): - super(SingleBlockManager, self)._delete_from_block(i, item) + new_values = com.take_1d(values, indexer, + fill_value=fill_value) - # possibly need to merge split blocks - if len(self.blocks) > 1: - new_items = Index(list(itertools.chain(*[ b.items for b in self.blocks ]))) - block = make_block(np.concatenate([ b.values for b in self.blocks ]), - new_items, - new_items, - dtype=self._block.dtype) + # fill if needed + if method is not None or limit is not None: + new_values = com.interpolate_2d(new_values, method=method, + limit=limit, fill_value=fill_value) - elif len(self.blocks): - block = self.blocks[0] - else: - block = make_block(np.array([], dtype=self._block.dtype), [], []) + if self._block.is_sparse: + make_block = self._block.make_block_same_class - self.blocks = [block] - self._block = block - self._values = self._block.values + block = make_block(new_values, copy=copy, + placement=slice(0, len(new_axis))) - def get_slice(self, slobj): - return self.__class__(self._block._slice(slobj), - self.index[slobj], fastpath=True) + mgr = SingleBlockManager(block, new_axis) + mgr._consolidate_inplace() + return mgr - def set_axis(self, axis, value, maybe_rename=True, check_axis=True): - cur_axis, value = self._set_axis(axis, value, check_axis) - self._block.set_ref_items(self.items, maybe_rename=maybe_rename) + def get_slice(self, slobj, axis=0): + if axis >= self.ndim: + raise IndexError("Requested axis not found in manager") - def set_ref_items(self, ref_items, maybe_rename=True): - """ we can optimize and our ref_locs are always equal to ref_items """ - if maybe_rename: - self.items = ref_items - self.ref_items = ref_items + return self.__class__(self._block._slice(slobj), + self.index[slobj], fastpath=True) @property def index(self): @@ -3804,6 +3096,18 @@ def dtype(self): def ftype(self): return self._block.ftype + def get_dtype_counts(self): + return {self.dtype.name: 1} + + def get_ftype_counts(self): + return {self.ftype: 1} + + def get_dtypes(self): + return np.array([self._block.dtype]) + + def get_ftypes(self): + 
return np.array([self._block.ftype]) + @property def values(self): return self._values.view() @@ -3825,6 +3129,16 @@ def _consolidate_check(self): def _consolidate_inplace(self): pass + def delete(self, item): + """ + Delete single item from SingleBlockManager. + + Ensures that self.blocks doesn't become empty. + """ + loc = self.items.get_loc(item) + self._block.delete(loc) + self.axes[0] = self.axes[0].delete(loc) + def fast_xs(self, loc): """ fast path for getting a cross-section @@ -3832,6 +3146,7 @@ def fast_xs(self, loc): """ return self._block.values[loc] + def construction_error(tot_items, block_shape, axes, e=None): """ raise a helpful message about our construction """ passed = tuple(map(int, [tot_items] + list(block_shape))) @@ -3841,14 +3156,15 @@ def construction_error(tot_items, block_shape, axes, e=None): raise ValueError("Shape of passed values is {0}, indices imply {1}".format( passed,implied)) + def create_block_manager_from_blocks(blocks, axes): try: - - # if we are passed values, make the blocks if len(blocks) == 1 and not isinstance(blocks[0], Block): - placement = None if axes[0].is_unique else np.arange(len(axes[0])) - blocks = [ - make_block(blocks[0], axes[0], axes[0], placement=placement)] + # It's OK if a single block is passed as values, its placement is + # basically "all items", but if there're many, don't bother + # converting, it's an error anyway. + blocks = [make_block(values=blocks[0], + placement=slice(0, len(axes[0])))] mgr = BlockManager(blocks, axes) mgr._consolidate_inplace() @@ -3870,26 +3186,7 @@ def create_block_manager_from_arrays(arrays, names, axes): construction_error(len(arrays), arrays[0].shape[1:], axes, e) -def maybe_create_block_in_items_map(im, block): - """ create/return the block in an items_map """ - try: - return im[block] - except: - im[block] = l = [None] * len(block.items) - return l - - def form_blocks(arrays, names, axes): - - # pre-filter out items if we passed it - items = axes[0] - - if len(arrays) < len(items): - nn = set(names) - extra_items = Index([i for i in items if i not in nn]) - else: - extra_items = [] - # put "leftover" items in float bucket, where else? # generalize? 
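The alignment step a few lines below maps the supplied column names onto the target items axis; positions with no matching array end up in extra_locs and are backfilled with an all-NaN object block. A standalone sketch with hypothetical names:

    import numpy as np
    from pandas import Index

    axes0 = Index(['a', 'b', 'c', 'd'])       # target items axis
    names = Index(['c', 'a'])                 # arrays actually supplied

    # for each slot of axes0: index of the array that fills it, -1 if none
    names_indexer = names.get_indexer_for(axes0)
    print(names_indexer.tolist())             # [1, -1, 0, -1]

    extra_locs = np.where(names_indexer == -1)[0]
    print(extra_locs.tolist())                # [1, 3]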
float_items = [] @@ -3899,8 +3196,23 @@ def form_blocks(arrays, names, axes): object_items = [] sparse_items = [] datetime_items = [] + extra_locs = [] + + names_idx = Index(names) + if names_idx.equals(axes[0]): + names_indexer = np.arange(len(names_idx)) + else: + assert names_idx.intersection(axes[0]).is_unique + names_indexer = names_idx.get_indexer_for(axes[0]) + + for i, name_idx in enumerate(names_indexer): + if name_idx == -1: + extra_locs.append(i) + continue + + k = names[name_idx] + v = arrays[name_idx] - for i, (k, v) in enumerate(zip(names, arrays)): if isinstance(v, (SparseArray, ABCSparseSeries)): sparse_items.append((i, k, v)) elif issubclass(v.dtype.type, np.floating): @@ -3927,72 +3239,67 @@ def form_blocks(arrays, names, axes): else: object_items.append((i, k, v)) - is_unique = items.is_unique blocks = [] if len(float_items): - float_blocks = _multi_blockify(float_items, items, is_unique=is_unique) + float_blocks = _multi_blockify(float_items) blocks.extend(float_blocks) if len(complex_items): complex_blocks = _simple_blockify( - complex_items, items, np.complex128, is_unique=is_unique) + complex_items, np.complex128) blocks.extend(complex_blocks) if len(int_items): - int_blocks = _multi_blockify(int_items, items, is_unique=is_unique) + int_blocks = _multi_blockify(int_items) blocks.extend(int_blocks) if len(datetime_items): datetime_blocks = _simple_blockify( - datetime_items, items, _NS_DTYPE, is_unique=is_unique) + datetime_items, _NS_DTYPE) blocks.extend(datetime_blocks) if len(bool_items): bool_blocks = _simple_blockify( - bool_items, items, np.bool_, is_unique=is_unique) + bool_items, np.bool_) blocks.extend(bool_blocks) if len(object_items) > 0: object_blocks = _simple_blockify( - object_items, items, np.object_, is_unique=is_unique) + object_items, np.object_) blocks.extend(object_blocks) if len(sparse_items) > 0: - sparse_blocks = _sparse_blockify(sparse_items, items) + sparse_blocks = _sparse_blockify(sparse_items) blocks.extend(sparse_blocks) - if len(extra_items): - shape = (len(extra_items),) + tuple(len(x) for x in axes[1:]) + if len(extra_locs): + shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) # empty items -> dtype object block_values = np.empty(shape, dtype=object) block_values.fill(np.nan) - placement = None if is_unique else np.arange(len(extra_items)) - na_block = make_block( - block_values, extra_items, items, placement=placement) + na_block = make_block(block_values, placement=extra_locs) blocks.append(na_block) return blocks -def _simple_blockify(tuples, ref_items, dtype, is_unique=True): +def _simple_blockify(tuples, dtype): """ return a single array of a block that has a single dtype; if dtype is not None, coerce to this dtype """ - block_items, values, placement = _stack_arrays(tuples, ref_items, dtype) + values, placement = _stack_arrays(tuples, dtype) # CHECK DTYPE? 
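With ref_items gone, a consolidated block reduces to a 2-D value array plus the integer placement of its rows. A minimal sketch of what the _stack_arrays route produces from same-dtype columns, using hypothetical tuples in the (position, name, values) shape that form_blocks builds:

    import numpy as np

    tuples = [(0, 'a', np.array([1.0, 2.0])),
              (3, 'd', np.array([3.0, 4.0]))]

    placement = np.array([t[0] for t in tuples])   # rows map to items 0 and 3
    stacked = np.vstack([t[2] for t in tuples])    # shape (n_items, len(index))
    print(placement.tolist(), stacked.shape)       # [0, 3] (2, 2)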
if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) - if is_unique: - placement = None - block = make_block(values, block_items, ref_items, placement=placement) + block = make_block(values, placement=placement) return [block] -def _multi_blockify(tuples, ref_items, dtype=None, is_unique=True): +def _multi_blockify(tuples, dtype=None): """ return an array of blocks that potentially have different dtypes """ # group by dtype @@ -4001,37 +3308,32 @@ def _multi_blockify(tuples, ref_items, dtype=None, is_unique=True): new_blocks = [] for dtype, tup_block in grouper: - block_items, values, placement = _stack_arrays( - list(tup_block), ref_items, dtype) - if is_unique: - placement = None - block = make_block(values, block_items, ref_items, placement=placement) + values, placement = _stack_arrays( + list(tup_block), dtype) + + block = make_block(values, placement=placement) new_blocks.append(block) return new_blocks -def _sparse_blockify(tuples, ref_items, dtype=None): +def _sparse_blockify(tuples, dtype=None): """ return an array of blocks that potentially have different dtypes (and are sparse) """ new_blocks = [] for i, names, array in tuples: - - if not isinstance(names, (list, tuple)): - names = [names] - items = ref_items[ref_items.isin(names)] - array = _maybe_to_sparse(array) block = make_block( - array, items, ref_items, klass=SparseBlock, fastpath=True) + array, klass=SparseBlock, fastpath=True, + placement=[i]) new_blocks.append(block) return new_blocks -def _stack_arrays(tuples, ref_items, dtype): +def _stack_arrays(tuples, dtype): # fml def _asarray_compat(x): @@ -4055,33 +3357,7 @@ def _shape_compat(x): for i, arr in enumerate(arrays): stacked[i] = _asarray_compat(arr) - # index may box values - if ref_items.is_unique: - items = ref_items[ref_items.isin(names)] - else: - # a mi - if isinstance(ref_items, MultiIndex): - names = MultiIndex.from_tuples(names) - items = ref_items[ref_items.isin(names)] - - # plain old dups - else: - items = _ensure_index([n for n in names if n in ref_items]) - if len(items) != len(stacked): - raise ValueError("invalid names passed _stack_arrays") - - return items, stacked, placement - - -def _blocks_to_series_dict(blocks, index=None): - from pandas.core.series import Series - - series_dict = {} - - for block in blocks: - for item, vec in zip(block.items, block.values): - series_dict[item] = Series(vec, index=index, name=item) - return series_dict + return stacked, placement def _interleaved_dtype(blocks): @@ -4143,7 +3419,7 @@ def _lcd_dtype(l): return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) -def _consolidate(blocks, items): +def _consolidate(blocks): """ Merge blocks having same dtype, exclude non-consolidating blocks """ @@ -4154,7 +3430,7 @@ def _consolidate(blocks, items): new_blocks = [] for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks(list(group_blocks), items, dtype=dtype, + merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype, _can_consolidate=_can_consolidate) if isinstance(merged_blocks, list): new_blocks.extend(merged_blocks) @@ -4164,14 +3440,7 @@ def _consolidate(blocks, items): return new_blocks -def _valid_blocks(newb): - if newb is None: - return [] - if not isinstance(newb, list): - newb = [ newb ] - return [ b for b in newb if len(b.items) > 0 ] - -def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True): +def _merge_blocks(blocks, dtype=None, _can_consolidate=True): if len(blocks) == 1: return blocks[0] @@ -4182,22 
+3451,17 @@ def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True): raise AssertionError("_merge_blocks are invalid!") dtype = blocks[0].dtype - if not items.is_unique: - blocks = sorted(blocks, key=lambda b: b.ref_locs.tolist()) - + # FIXME: optimization potential in case all mgrs contain slices and + # combination of those slices is a slice, too. + new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) new_values = _vstack([b.values for b in blocks], dtype) - new_items = blocks[0].items.append([b.items for b in blocks[1:]]) - new_block = make_block(new_values, new_items, items) - # unique, can reindex - if items.is_unique: - return new_block.reindex_items_from(items) + argsort = np.argsort(new_mgr_locs) + new_values = new_values[argsort] + new_mgr_locs = new_mgr_locs[argsort] - # merge the ref_locs - new_ref_locs = [b._ref_locs for b in blocks] - if all([x is not None for x in new_ref_locs]): - new_block.set_ref_locs(np.concatenate(new_ref_locs)) - return new_block + return make_block(new_values, + fastpath=True, placement=new_mgr_locs) # no merge return blocks @@ -4223,14 +3487,6 @@ def _vstack(to_stack, dtype): return np.vstack(to_stack) -def _possibly_convert_to_indexer(loc): - if com._is_bool_indexer(loc): - loc = [i for i, v in enumerate(loc) if v] - elif isinstance(loc, slice): - loc = lrange(loc.start, loc.stop) - return loc - - def _possibly_compare(a, b, op): res = op(a, b) is_a_array = isinstance(a, np.ndarray) @@ -4246,3 +3502,594 @@ def _possibly_compare(a, b, op): raise TypeError("Cannot compare types %r and %r" % tuple(type_names)) return res + + + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + + +def _invert_reordering(reordering, minlength=None): + """ + Invert reordering operation. + + Given array `reordering`, make `reordering_inv` of it, such that:: + + reordering_inv[reordering[x]] = x + + There are two types of indexers: + + source + is when element *s* at position *i* means that values to fill *i-th* + item of reindex operation should be taken from *s-th* item of the + original (this is what is returned by `pandas.Index.reindex`). + destination + is when element *d* at position *i* means that values from *i-th* item + of source should be used to fill *d-th* item of reindexing operation. + + This function will convert from *source* to *destination* and vice-versa. + + .. note:: trailing ``-1`` may be lost upon conversion (this is what + `minlength` is there for). + + .. note:: if *source* indexer is not unique, corresponding *destination* + indexer will have ``dtype=object`` and will contain lists. 
+
+    Examples:
+
+    >>> _invert_reordering([3, -1, 2, 4, -1])
+    array([-1, -1, 2, 0, 3])
+    >>> _invert_reordering([-1, -1, 0, 2, 3])
+    array([3, -1, 2, 4])
+    >>> _invert_reordering([1,3,5])
+    array([-1, 0, -1, 1, -1, 2])
+
+    """
+    reordering = np.asanyarray(reordering)
+    if not com.is_integer_dtype(reordering):
+        raise ValueError("Only integer indexers are supported")
+
+    nonneg_indices = reordering[reordering >= 0]
+    counts = np.bincount(nonneg_indices, minlength=minlength)
+    has_non_unique = (counts > 1).any()
+
+    dtype = np.dtype(np.object_) if has_non_unique else np.dtype(np.int_)
+    inverted = np.empty_like(counts, dtype=dtype)
+    inverted.fill(-1)
+
+    nonneg_positions = np.arange(len(reordering), dtype=np.int_)[reordering >= 0]
+    np.put(inverted, nonneg_indices, nonneg_positions)
+
+    if has_non_unique:
+        nonunique_elements = np.arange(len(counts))[counts > 1]
+        for elt in nonunique_elements:
+            inverted[elt] = nonneg_positions[nonneg_indices == elt].tolist()
+
+    return inverted
+
+
+def _get_blkno_placements(blknos, blk_count, group=True):
+    """
+    Yield (blkno, BlockPlacement) pairs for the runs found in `blknos`.
+
+    Parameters
+    ----------
+    blknos : array of int64
+    blk_count : int
+    group : bool
+
+    Returns
+    -------
+    iterator
+        yields (blkno, BlockPlacement) pairs
+
+    """
+
+    # FIXME: blk_count is unused, but it may avoid the use of dicts in cython
+    for blkno, indexer in lib.get_blkno_indexers(blknos, group):
+        yield blkno, BlockPlacement(indexer)
+
+
+def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
+    """
+    If two indices overlap, add suffixes to overlapping entries.
+
+    If the corresponding suffix is empty, the entry is simply converted to a
+    string.
+
+    """
+    to_rename = left.intersection(right)
+    if len(to_rename) == 0:
+        return left, right
+    else:
+        if not lsuffix and not rsuffix:
+            raise ValueError('columns overlap but no suffix specified: %s' %
+                             to_rename)
+
+        def lrenamer(x):
+            if x in to_rename:
+                return '%s%s' % (x, lsuffix)
+            return x
+
+        def rrenamer(x):
+            if x in to_rename:
+                return '%s%s' % (x, rsuffix)
+            return x
+
+        return (_transform_index(left, lrenamer),
+                _transform_index(right, rrenamer))
+
+
+def _transform_index(index, func):
+    """
+    Apply a function to all values found in the index.
+
+    This includes transforming multiindex entries separately.
+
+    """
+    if isinstance(index, MultiIndex):
+        items = [tuple(func(y) for y in x) for x in index]
+        return MultiIndex.from_tuples(items, names=index.names)
+    else:
+        items = [func(x) for x in index]
+        return Index(items, name=index.name)
+
+
+def _putmask_smart(v, m, n):
+    """
+    Return a new array with masked values set, trying to preserve the dtype
+    if possible.
+
+    Parameters
+    ----------
+    v : array_like
+    m : array_like
+    n : array_like
+    """
+
+    # n should be the same length as the mask or a scalar here
+    if not is_list_like(n):
+        n = np.array([n] * len(m))
+
+    # see if we are only masking values that, once put,
+    # will fit in the current dtype
+    try:
+        nn = n[m]
+        nn_at = nn.astype(v.dtype)
+        if (nn == nn_at).all():
+            nv = v.copy()
+            nv[m] = nn_at
+            return nv
+    except (ValueError, IndexError, TypeError):
+        pass
+
+    # change the dtype
+    dtype, _ = com._maybe_promote(n.dtype)
+    nv = v.astype(dtype)
+    try:
+        nv[m] = n
+    except ValueError:
+        idx, = np.where(np.squeeze(m))
+        for mask_index, new_val in zip(idx, n):
+            nv[mask_index] = new_val
+    return nv
+
+
+def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
+    """
+    Concatenate block managers into one.
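+
+    This is a two-step operation: per-manager concatenation plans are
+    combined into a single plan first, and only then is each output block
+    materialized from its join units.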
+
+    Parameters
+    ----------
+    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
+    axes : list of Index
+    concat_axis : int
+    copy : bool
+
+    """
+    concat_plan = combine_concat_plans([get_mgr_concatenation_plan(mgr, indexers)
+                                        for mgr, indexers in mgrs_indexers],
+                                       concat_axis)
+
+    blocks = [make_block(concatenate_join_units(join_units, concat_axis,
+                                                copy=copy),
+                         placement=placement)
+              for placement, join_units in concat_plan]
+
+    return BlockManager(blocks, axes)
+
+
+def get_empty_dtype_and_na(join_units):
+    """
+    Return dtype and N/A values to use when concatenating specified units.
+
+    The returned N/A value may be None, which means that no casting is
+    needed.
+
+    Returns
+    -------
+    dtype
+    na
+    """
+
+    if len(join_units) == 1:
+        blk = join_units[0].block
+        if blk is None:
+            return np.float64, np.nan
+        else:
+            return blk.dtype, None
+
+    has_none_blocks = False
+    dtypes = [None] * len(join_units)
+
+    for i, unit in enumerate(join_units):
+        if unit.block is None:
+            has_none_blocks = True
+        else:
+            dtypes[i] = unit.dtype
+
+    if not has_none_blocks and len(set(dtypes)) == 1:
+        # Unanimous decision, nothing to upcast.
+        return dtypes[0], None
+
+    upcast_classes = set()
+    null_upcast_classes = set()
+    for dtype, unit in zip(dtypes, join_units):
+        if dtype is None:
+            continue
+
+        if issubclass(dtype.type, (np.object_, np.bool_)):
+            upcast_cls = 'object'
+        elif is_datetime64_dtype(dtype):
+            upcast_cls = 'datetime'
+        elif is_timedelta64_dtype(dtype):
+            upcast_cls = 'timedelta'
+        else:
+            upcast_cls = 'float'
+
+        # Null blocks should not influence upcast class selection, unless
+        # there are only null blocks, in which case the same upcasting rules
+        # are applied to the null upcast classes.
+        if unit.is_null:
+            null_upcast_classes.add(upcast_cls)
+        else:
+            upcast_classes.add(upcast_cls)
+
+    if not upcast_classes:
+        upcast_classes = null_upcast_classes
+
+    # create the result
+    if 'object' in upcast_classes:
+        return np.dtype(np.object_), np.nan
+    elif 'float' in upcast_classes:
+        return np.dtype(np.float64), np.nan
+    elif 'datetime' in upcast_classes:
+        return np.dtype('M8[ns]'), tslib.iNaT
+    elif 'timedelta' in upcast_classes:
+        return np.dtype('m8[ns]'), tslib.iNaT
+    else:  # pragma: no cover
+        raise AssertionError("invalid dtype determination in get_concat_dtype")
+
+
+def concatenate_join_units(join_units, concat_axis, copy):
+    """
+    Concatenate values from several join units along the selected axis.
+    """
+    if concat_axis == 0 and len(join_units) > 1:
+        # Concatenating join units along ax0 is handled in _merge_blocks.
+        raise AssertionError("Concatenating join units along axis0")
+
+    empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
+
+    to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
+                                         upcasted_na=upcasted_na)
+                 for ju in join_units]
+
+    if len(to_concat) == 1:
+        # Only one block, nothing to concatenate.
+        concat_values = to_concat[0]
+        if copy and concat_values.base is not None:
+            concat_values = concat_values.copy()
+    else:
+        concat_values = com._concat_compat(to_concat, axis=concat_axis)
+
+    # FIXME: optimization potential: if len(join_units) == 1, single join unit
+    # is densified and sparsified back.
+    if any(unit.is_sparse for unit in join_units):
+        # If one of the units was sparse, concat_values is 2-d and there's
+        # only one item.
+        return SparseArray(concat_values[0])
+    else:
+        return concat_values
+
+
+def get_mgr_concatenation_plan(mgr, indexers):
+    """
+    Construct concatenation plan for given block manager and indexers.
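+
+    The plan decomposes the reindexed items axis into contiguous runs, each
+    backed either by a single existing block or, where the ax0 indexer is
+    -1, by a placeholder unit of missing values.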
+
+    Parameters
+    ----------
+    mgr : BlockManager
+    indexers : dict of {axis: indexer}
+
+    Returns
+    -------
+    plan : list of (BlockPlacement, JoinUnit) tuples
+
+    """
+    # Calculate the post-reindex shape, except for the item axis, which is
+    # handled separately for each block anyway.
+    mgr_shape = list(mgr.shape)
+    for ax, indexer in indexers.items():
+        mgr_shape[ax] = len(indexer)
+    mgr_shape = tuple(mgr_shape)
+
+    if 0 in indexers:
+        ax0_indexer = indexers.pop(0)
+        blknos = com.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
+        blklocs = com.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
+    else:
+        if mgr._is_single_block:
+            blk = mgr.blocks[0]
+            return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
+
+        ax0_indexer = None
+        blknos = mgr._blknos
+        blklocs = mgr._blklocs
+
+    plan = []
+    for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks),
+                                                   group=False):
+        assert placements.is_slice_like
+
+        join_unit_indexers = indexers.copy()
+
+        shape = list(mgr_shape)
+        shape[0] = len(placements)
+        shape = tuple(shape)
+
+        if blkno == -1:
+            unit = JoinUnit(None, shape)
+        else:
+            blk = mgr.blocks[blkno]
+            ax0_blk_indexer = blklocs[placements.indexer]
+
+            unit_no_ax0_reindexing = (
+                len(placements) == len(blk.mgr_locs) and
+                # Fastpath detection of join unit not needing to reindex its
+                # block: no ax0 reindexing took place and block placement was
+                # sequential before.
+                ((ax0_indexer is None
+                  and blk.mgr_locs.is_slice_like
+                  and blk.mgr_locs.as_slice.step == 1) or
+                 # Slow-ish detection: all indexer locs are sequential (and
+                 # length match is checked above).
+                 (np.diff(ax0_blk_indexer) == 1).all()))
+
+            # Omit indexer if no item reindexing is required.
+            if unit_no_ax0_reindexing:
+                join_unit_indexers.pop(0, None)
+            else:
+                join_unit_indexers[0] = ax0_blk_indexer
+
+            unit = JoinUnit(blk, shape, join_unit_indexers)
+
+        plan.append((placements, unit))
+
+    return plan
+
+
+def combine_concat_plans(plans, concat_axis):
+    """
+    Combine multiple concatenation plans into one.
+
+    Join units of the input plans may be trimmed in place (see
+    trim_join_unit) to keep item counts aligned across plans.
+    """
+    if len(plans) == 1:
+        for p in plans[0]:
+            yield p[0], [p[1]]
+
+    elif concat_axis == 0:
+        offset = 0
+        for plan in plans:
+            last_plc = None
+
+            for plc, unit in plan:
+                yield plc.add(offset), [unit]
+                last_plc = plc
+
+            if last_plc is not None:
+                offset += last_plc.as_slice.stop
+
+    else:
+        num_ended = [0]
+
+        def _next_or_none(seq):
+            retval = next(seq, None)
+            if retval is None:
+                num_ended[0] += 1
+            return retval
+
+        plans = list(map(iter, plans))
+        next_items = list(map(_next_or_none, plans))
+
+        while num_ended[0] != len(next_items):
+            if num_ended[0] > 0:
+                raise ValueError("Plan shapes are not aligned")
+
+            placements, units = zip(*next_items)
+
+            lengths = list(map(len, placements))
+            min_len, max_len = min(lengths), max(lengths)
+
+            if min_len == max_len:
+                yield placements[0], units
+                next_items[:] = map(_next_or_none, plans)
+            else:
+                yielded_placement = None
+                yielded_units = [None] * len(next_items)
+                for i, (plc, unit) in enumerate(next_items):
+                    yielded_units[i] = unit
+                    if len(plc) > min_len:
+                        # trim_join_unit updates unit in place, so only
+                        # placement needs to be sliced to skip min_len.
+                        next_items[i] = (plc[min_len:],
+                                         trim_join_unit(unit, min_len))
+                    else:
+                        yielded_placement = plc
+                        next_items[i] = _next_or_none(plans[i])
+
+                yield yielded_placement, yielded_units
+
+
+def trim_join_unit(join_unit, length):
+    """
+    Reduce join_unit's shape along item axis to length.
+
+    Extra items that didn't fit are returned as a separate block.
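+
+    Note that `join_unit` is mutated in place: after the call it covers
+    only the first `length` items, and the returned JoinUnit covers the
+    remainder.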
+    """
+
+    if 0 not in join_unit.indexers:
+        extra_indexers = join_unit.indexers
+
+        if join_unit.block is None:
+            extra_block = None
+        else:
+            extra_block = join_unit.block.getitem_block(slice(length, None))
+            join_unit.block = join_unit.block.getitem_block(slice(length))
+    else:
+        extra_block = join_unit.block
+
+        extra_indexers = copy.copy(join_unit.indexers)
+        extra_indexers[0] = extra_indexers[0][length:]
+        join_unit.indexers[0] = join_unit.indexers[0][:length]
+
+    extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
+    join_unit.shape = (length,) + join_unit.shape[1:]
+
+    return JoinUnit(block=extra_block, indexers=extra_indexers,
+                    shape=extra_shape)
+
+
+class JoinUnit(object):
+    def __init__(self, block, shape, indexers=None):
+        # Passing shape explicitly is required for cases when block is None.
+        # A None default (rather than {}) avoids sharing one mutable dict
+        # between instances; trim_join_unit mutates indexers in place.
+        if indexers is None:
+            indexers = {}
+        self.block = block
+        self.indexers = indexers
+        self.shape = shape
+
+    def __repr__(self):
+        return '%s(%r, %s)' % (self.__class__.__name__,
+                               self.block, self.indexers)
+
+    @cache_readonly
+    def needs_filling(self):
+        for indexer in self.indexers.values():
+            # FIXME: cache results of indexer == -1 checks.
+            if (indexer == -1).any():
+                return True
+
+        return False
+
+    @cache_readonly
+    def dtype(self):
+        if self.block is None:
+            raise AssertionError("Block is None, no dtype")
+
+        if not self.needs_filling:
+            return self.block.dtype
+        else:
+            return np.dtype(com._maybe_promote(self.block.dtype,
+                                               self.block.fill_value)[0])
+
+    @cache_readonly
+    def is_null(self):
+        if self.block is None:
+            return True
+
+        if not self.block._can_hold_na:
+            return False
+
+        # Usually it's enough to check only a small fraction of the values
+        # to see that a block is NOT null; chunking should help in such
+        # cases.  The chunk size of 1000 was chosen rather arbitrarily.
+        values_flat = self.block.values.ravel()
+        total_len = values_flat.shape[0]
+        chunk_len = max(total_len // 40, 1000)
+        for i in range(0, total_len, chunk_len):
+            if not isnull(values_flat[i: i + chunk_len]).all():
+                return False
+
+        return True
+
+    @cache_readonly
+    def is_sparse(self):
+        return self.block is not None and self.block.is_sparse
+
+    def get_reindexed_values(self, empty_dtype, upcasted_na):
+        if upcasted_na is None:
+            # No upcasting is necessary
+            fill_value = self.block.fill_value
+            values = self.block.get_values()
+        else:
+            fill_value = upcasted_na
+
+            if self.is_null:
+                missing_arr = np.empty(self.shape, dtype=empty_dtype)
+                if np.prod(self.shape):
+                    # NumPy 1.6 workaround: this statement gets strange if all
+                    # blocks are of same dtype and some of them are empty:
+                    # empty ones are considered "null" so they must be filled,
+                    # but no dtype upcasting happens and the dtype may not
+                    # allow NaNs.
+                    #
+                    # In general, no one should get hurt when one tries to put
+                    # incorrect values into an empty array, but numpy 1.6 is
+                    # strict about that.
+                    missing_arr.fill(fill_value)
+                return missing_arr
+
+            if self.block.is_bool:
+                # External code requested filling/upcasting, bool values must
+                # be upcasted to object to avoid being upcasted to numeric.
+                values = self.block.astype(np.object_).values
+            else:
+                # No dtype upcasting is done here, it will be performed during
+                # concatenation itself.
+                values = self.block.get_values()
+
+        if not self.indexers:
+            # If there's no indexing to be done, we want to signal outside
+            # code that this array must be copied explicitly. This is done
+            # by returning a view and checking `retval.base`.
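+            # (A freshly-created view keeps a reference to the original
+            # array in ``retval.base``, whereas the take_nd branch below
+            # allocates new output, so the check is cheap and reliable.)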
+            return values.view()
+        else:
+            for ax, indexer in self.indexers.items():
+                values = com.take_nd(values, indexer, axis=ax,
+                                     fill_value=fill_value)
+
+        return values
+
+
+def _fast_count_smallints(arr):
+    """Count occurrences of each value in an array of small non-negative
+    ints, returning (value, count) pairs."""
+    if len(arr) == 0:
+        # Handle empty arr case separately: numpy 1.6 chokes on that.
+        return np.empty((0, 2), dtype=arr.dtype)
+    else:
+        counts = np.bincount(arr)
+        nz = counts.nonzero()[0]
+        return np.c_[nz, counts[nz]]
+
+
+def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
+    if isinstance(slice_or_indexer, slice):
+        return 'slice', slice_or_indexer, lib.slice_len(slice_or_indexer,
+                                                        length)
+    elif (isinstance(slice_or_indexer, np.ndarray) and
+          slice_or_indexer.dtype == np.bool_):
+        return 'mask', slice_or_indexer, slice_or_indexer.sum()
+    else:
+        indexer = np.asanyarray(slice_or_indexer, dtype=np.int_)
+        if not allow_fill:
+            indexer = _maybe_convert_indices(indexer, length)
+        return 'fancy', indexer, len(indexer)
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 7dc266617c5fd..196b80a83723f 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -447,15 +447,17 @@ def _unstack_frame(obj, level):
         new_blocks = []
         mask_blocks = []
         for blk in obj._data.blocks:
+            blk_items = obj._data.items[blk.mgr_locs.indexer]
             bunstacker = _Unstacker(blk.values.T, obj.index, level=level,
-                                    value_columns=blk.items)
+                                    value_columns=blk_items)
             new_items = bunstacker.get_new_columns()
+            new_placement = new_columns.get_indexer(new_items)
             new_values, mask = bunstacker.get_new_values()
 
-            mblk = make_block(mask.T, new_items, new_columns)
+            mblk = make_block(mask.T, placement=new_placement)
             mask_blocks.append(mblk)
 
-            newb = make_block(new_values.T, new_items, new_columns)
+            newb = make_block(new_values.T, placement=new_placement)
             new_blocks.append(newb)
 
         result = DataFrame(BlockManager(new_blocks, new_axes))
@@ -1071,10 +1073,11 @@ def make_axis_dummies(frame, axis='minor', transform=None):
     return DataFrame(values, columns=items, index=frame.index)
 
 
-def block2d_to_blocknd(values, items, shape, labels, ref_items=None):
+def block2d_to_blocknd(values, placement, shape, labels, ref_items):
     """ pivot to the labels shape """
     from pandas.core.internals import make_block
-    panel_shape = (len(items),) + shape
+
+    panel_shape = (len(placement),) + shape
 
     # TODO: lexsort depth needs to be 2!!
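A minimal sketch of the placement-based block construction these hunks
migrate to (illustrative only; the column names and data are made up, and it
assumes just the `make_block(values, placement=...)` signature and the
`BlockManager(blocks, axes)` constructor used throughout this diff):

    import numpy as np
    from pandas import Index
    from pandas.core.internals import make_block, BlockManager

    cols = Index(['a', 'b', 'c'])

    # Blocks no longer carry item labels or ref_items; each block records
    # only the integer positions (mgr_locs) of its rows within the items
    # axis of the owning manager.
    fblock = make_block(np.arange(6, dtype='f8').reshape(2, 3),
                        placement=[0, 2])        # rows for 'a' and 'c'
    oblock = make_block(np.array([['x', 'y', 'z']], dtype=object),
                        placement=[1])           # row for 'b'

    mgr = BlockManager([fblock, oblock], [cols, Index(range(3))])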
@@ -1092,13 +1095,10 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): pvalues.fill(fill_value) values = values - for i in range(len(items)): + for i in range(len(placement)): pvalues[i].flat[mask] = values[:, i] - if ref_items is None: - ref_items = items - - return make_block(pvalues, items, ref_items) + return make_block(pvalues, placement=placement) def factor_indexer(shape, labels): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 105bea92124fd..7da86565b51cd 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -356,7 +356,7 @@ def encode(obj): return {'typ': 'block_manager', 'klass': obj.__class__.__name__, 'axes': data.axes, - 'blocks': [{'items': b.items, + 'blocks': [{'items': data.items.take(b.mgr_locs), 'values': convert(b.values), 'shape': b.values.shape, 'dtype': b.dtype.num, @@ -481,10 +481,11 @@ def decode(obj): axes = obj['axes'] def create_block(b): - dtype = dtype_for(b['dtype']) - return make_block(unconvert(b['values'], dtype, b['compress']) - .reshape(b['shape']), b['items'], axes[0], - klass=getattr(internals, b['klass'])) + values = unconvert(b['values'], dtype_for(b['dtype']), + b['compress']).reshape(b['shape']) + return make_block(values=values, + klass=getattr(internals, b['klass']), + placement=axes[0].get_indexer(b['items'])) blocks = [create_block(b) for b in obj['blocks']] return globals()[obj['klass']](BlockManager(blocks, axes)) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 27298e52e3186..95daa2bbc2752 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1704,11 +1704,11 @@ def set_kind(self): if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, min_itemsize, + def set_atom(self, block, block_items, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ - self.values = list(block.items) + self.values = list(block_items) dtype = block.dtype.name rvalues = block.values.ravel() inferred_type = lib.infer_dtype(rvalues) @@ -1763,7 +1763,7 @@ def set_atom(self, block, existing_col, min_itemsize, # end up here ### elif inferred_type == 'string' or dtype == 'object': self.set_atom_string( - block, + block, block_items, existing_col, min_itemsize, nan_rep, @@ -1776,8 +1776,8 @@ def set_atom(self, block, existing_col, min_itemsize, def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string( - self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string(self, block, block_items, existing_col, min_itemsize, + nan_rep, encoding): # fill nan items with myself, don't disturb the blocks by # trying to downcast block = block.fillna(nan_rep, downcast=False)[0] @@ -1789,9 +1789,9 @@ def set_atom_string( # we cannot serialize this data, so report an exception on a column # by column basis - for item in block.items: + for i, item in enumerate(block_items): - col = block.get(item) + col = block.iget(i) inferred_type = lib.infer_dtype(col.ravel()) if inferred_type != 'string': raise TypeError( @@ -2649,7 +2649,8 @@ def read(self, **kwargs): for i in range(self.nblocks): blk_items = self.read_index('block%d_items' % i) values = self.read_array('block%d_values' % i) - blk = make_block(values, blk_items, items) + blk = make_block(values, + placement=items.get_indexer(blk_items)) blocks.append(blk) return self.obj_type(BlockManager(blocks, axes)) @@ -2665,12 +2666,12 @@ def 
write(self, obj, **kwargs): self.write_index('axis%d' % i, ax) # Supporting mixed-type DataFrame objects...nontrivial - self.attrs.nblocks = nblocks = len(data.blocks) - for i in range(nblocks): - blk = data.blocks[i] + self.attrs.nblocks = len(data.blocks) + for i, blk in enumerate(data.blocks): # I have no idea why, but writing values before items fixed #2299 - self.write_array('block%d_values' % i, blk.values, items=blk.items) - self.write_index('block%d_items' % i, blk.items) + blk_items = data.items.take(blk.mgr_locs) + self.write_array('block%d_values' % i, blk.values, items=blk_items) + self.write_index('block%d_items' % i, blk_items) class FrameFixed(BlockManagerFixed): @@ -3190,51 +3191,63 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, for a in self.non_index_axes: obj = _reindex_axis(obj, a[0], a[1]) + def get_blk_items(mgr, blocks): + return [mgr.items.take(blk.mgr_locs) for blk in blocks] + # figure out data_columns and get out blocks block_obj = self.get_object(obj).consolidate() blocks = block_obj._data.blocks + blk_items = get_blk_items(block_obj._data, blocks) if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] data_columns = self.validate_data_columns( data_columns, min_itemsize) if len(data_columns): - blocks = block_obj.reindex_axis( + mgr = block_obj.reindex_axis( Index(axis_labels) - Index(data_columns), axis=axis - )._data.blocks + )._data + + blocks = list(mgr.blocks) + blk_items = get_blk_items(mgr, blocks) for c in data_columns: - blocks.extend( - block_obj.reindex_axis([c], axis=axis)._data.blocks) + mgr = block_obj.reindex_axis([c], axis=axis)._data + blocks.extend(mgr.blocks) + blk_items.extend(get_blk_items(mgr, mgr.blocks)) # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict([(tuple(b.items.tolist()), b) for b in blocks]) + by_items = dict([(tuple(b_items.tolist()), (b, b_items)) + for b, b_items in zip(blocks, blk_items)]) new_blocks = [] + new_blk_items = [] for ea in existing_table.values_axes: items = tuple(ea.values) try: - b = by_items.pop(items) + b, b_items = by_items.pop(items) new_blocks.append(b) + new_blk_items.append(b_items) except: raise ValueError( "cannot match existing table structure for [%s] on " "appending data" % ','.join(com.pprint_thing(item) for item in items)) blocks = new_blocks + blk_items = new_blk_items # add my values self.values_axes = [] - for i, b in enumerate(blocks): + for i, (b, b_items) in enumerate(zip(blocks, blk_items)): # shape of the data column are the indexable axes klass = DataCol name = None # we have a data_column - if (data_columns and len(b.items) == 1 and - b.items[0] in data_columns): + if (data_columns and len(b_items) == 1 and + b_items[0] in data_columns): klass = DataIndexableCol - name = b.items[0] + name = b_items[0] self.data_columns.append(name) # make sure that we match up the existing columns @@ -3252,7 +3265,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, try: col = klass.create_for_block( i=i, name=name, version=self.version) - col.set_atom(block=b, + col.set_atom(block=b, block_items=b_items, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, @@ -3268,7 +3281,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, raise Exception( "cannot find the correct atom type -> " "[dtype->%s,items->%s] %s" - % (b.dtype.name, b.items, str(detail)) + % (b.dtype.name, b_items, str(detail)) ) j += 1 @@ -3490,7 +3503,8 @@ def read(self, where=None, 
columns=None, **kwargs):
         take_labels = [l.take(sorter) for l in labels]
         items = Index(c.values)
         block = block2d_to_blocknd(
-            sorted_values, items, tuple(N), take_labels)
+            values=sorted_values, placement=np.arange(len(items)),
+            shape=tuple(N), labels=take_labels, ref_items=items)
 
         # create the object
         mgr = BlockManager([block], [items] + levels)
@@ -3823,7 +3837,7 @@ def read(self, where=None, columns=None, **kwargs):
 
         if values.ndim == 1:
             values = values.reshape(1, values.shape[0])
 
-        block = make_block(values, cols_, cols_)
+        block = make_block(values, placement=np.arange(len(cols_)))
         mgr = BlockManager([block], [cols_, index_])
         frames.append(DataFrame(mgr))
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
index b70248d1ef3f4..3054b75ce56ac 100644
--- a/pandas/io/tests/test_pickle.py
+++ b/pandas/io/tests/test_pickle.py
@@ -83,7 +83,6 @@ def test_read_pickles_0_13_0(self):
         self.read_pickles('0.13.0')
 
     def test_round_trip_current(self):
-
         for typ, dv in self.data.items():
 
             for dt, expected in dv.items():
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 9c9d20e51be64..90c2681b837e8 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -3504,7 +3504,6 @@ def test_invalid_filtering(self):
         self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']")
 
     def test_string_select(self):
-
         # GH 2973
         with ensure_clean_store(self.path) as store:
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index a1fef095ea277..0bac4f8011420 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -19,6 +19,17 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyTuple_New,
                       PyObject_SetAttrString)
 
+cdef extern from "Python.h":
+    ctypedef struct PySliceObject:
+        pass
+
+    cdef int PySlice_GetIndicesEx(
+        PySliceObject* s, Py_ssize_t length,
+        Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step,
+        Py_ssize_t *slicelength) except -1
+
+
+
 cimport cpython
 
 isnan = np.isnan
@@ -1232,6 +1243,419 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys,
     return result
 
 
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def get_blkno_indexers(int64_t[:] blknos, bint group=True):
+    """
+    Enumerate contiguous runs of integers in ndarray.
+
+    Iterate over elements of `blknos` yielding ``(blkno, slice(start, stop))``
+    pairs for each contiguous run found.
+
+    If `group` is True and there is more than one run for a certain blkno,
+    ``(blkno, array)`` is yielded instead, with the array containing the
+    positions of all elements equal to that blkno.
+
+    Returns
+    -------
+    iter : iterator of (int, slice or array)
+
+    """
+    # There's blkno in this function's name because it's used in block &
+    # blockno handling.
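+    #
+    # Illustrative run (hypothetical input): for
+    # blknos = np.array([0, 0, 1, 1, 0]), group=False yields
+    # (0, slice(0, 2)), (1, slice(2, 4)), (0, slice(4, 5)), while
+    # group=True merges the two runs of block 0 and yields
+    # (0, array([0, 1, 4])), (1, slice(2, 4)).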
+    cdef:
+        int64_t cur_blkno
+        Py_ssize_t i, start, stop, n, diff
+
+        list group_order
+        dict group_dict
+        int64_t[:] res_view
+
+    n = blknos.shape[0]
+
+    if n > 0:
+        start = 0
+        cur_blkno = blknos[start]
+
+        if not group:
+            for i in range(1, n):
+                if blknos[i] != cur_blkno:
+                    yield cur_blkno, slice(start, i)
+
+                    start = i
+                    cur_blkno = blknos[i]
+
+            yield cur_blkno, slice(start, n)
+        else:
+            group_order = []
+            group_dict = {}
+
+            for i in range(1, n):
+                if blknos[i] != cur_blkno:
+                    if cur_blkno not in group_dict:
+                        group_order.append(cur_blkno)
+                        group_dict[cur_blkno] = [(start, i)]
+                    else:
+                        group_dict[cur_blkno].append((start, i))
+
+                    start = i
+                    cur_blkno = blknos[i]
+
+            if cur_blkno not in group_dict:
+                group_order.append(cur_blkno)
+                group_dict[cur_blkno] = [(start, n)]
+            else:
+                group_dict[cur_blkno].append((start, n))
+
+            for blkno in group_order:
+                slices = group_dict[blkno]
+                if len(slices) == 1:
+                    yield blkno, slice(slices[0][0], slices[0][1])
+                else:
+                    tot_len = sum(stop - start for start, stop in slices)
+                    result = np.empty(tot_len, dtype=np.int64)
+                    res_view = result
+
+                    i = 0
+                    for start, stop in slices:
+                        for diff in range(start, stop):
+                            res_view[i] = diff
+                            i += 1
+
+                    yield blkno, result
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cpdef slice indexer_as_slice(int64_t[:] vals):
+    cdef:
+        Py_ssize_t i, n, start, stop
+        int64_t d
+
+    if vals is None:
+        raise TypeError("vals must be ndarray")
+
+    n = vals.shape[0]
+
+    if n == 0 or vals[0] < 0:
+        return None
+
+    if n == 1:
+        return slice(vals[0], vals[0] + 1, 1)
+
+    if vals[1] < 0:
+        return None
+
+    # n >= 2
+    d = vals[1] - vals[0]
+
+    if d == 0:
+        return None
+
+    for i in range(2, n):
+        if vals[i] < 0 or vals[i] - vals[i-1] != d:
+            return None
+
+    start = vals[0]
+    stop = start + n * d
+    if stop < 0 and d < 0:
+        return slice(start, None, d)
+    else:
+        return slice(start, stop, d)
+
+
+cpdef slice_canonize(slice s):
+    """
+    Convert slice to canonical bounded form.
+    """
+    cdef:
+        Py_ssize_t start, stop, step, length
+
+    if s.step is None:
+        step = 1
+    else:
+        step = s.step
+        if step == 0:
+            raise ValueError("slice step cannot be zero")
+
+    if step > 0:
+        if s.stop is None:
+            raise ValueError("unbounded slice")
+
+        stop = s.stop
+        if s.start is None:
+            start = 0
+        else:
+            start = s.start
+            if start > stop:
+                start = stop
+    elif step < 0:
+        if s.start is None:
+            raise ValueError("unbounded slice")
+
+        start = s.start
+        if s.stop is None:
+            stop = -1
+        else:
+            stop = s.stop
+            if stop > start:
+                stop = start
+
+    if start < 0 or (stop < 0 and s.stop is not None):
+        raise ValueError("unbounded slice")
+
+    if stop < 0:
+        return slice(start, None, step)
+    else:
+        return slice(start, stop, step)
+
+
+cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=INT64_MAX):
+    """
+    Get (start, stop, step, length) tuple for a slice.
+
+    If `objlen` is not specified, the slice must be bounded, otherwise the
+    result will be wrong.
+
+    """
+    cdef:
+        Py_ssize_t start, stop, step, length
+
+    if slc is None:
+        raise TypeError("slc should be a slice")
+
+    PySlice_GetIndicesEx(<PySliceObject *>slc, objlen,
+                         &start, &stop, &step, &length)
+    return start, stop, step, length
+
+
+cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=INT64_MAX) except -1:
+    """
+    Get length of a bounded slice.
+
+    The slice must not have any "open" bounds that would create dependency on
+    container size, i.e.:
+    - if ``s.step is None or s.step > 0``, ``s.stop`` is not ``None``
+    - if ``s.step < 0``, ``s.start`` is not ``None``
+
+    Otherwise, the result is unreliable.
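+
+    A quick doctest-style illustration (a bounded slice, so the default
+    `objlen` does not matter)::
+
+        >>> slice_len(slice(0, 10, 2))
+        5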
+
+    """
+    cdef:
+        Py_ssize_t start, stop, step, length
+
+    if slc is None:
+        raise TypeError("slc must be a slice")
+
+    PySlice_GetIndicesEx(<PySliceObject *>slc, objlen,
+                         &start, &stop, &step, &length)
+
+    return length
+
+
+def slice_getitem(slice slc not None, ind):
+    cdef:
+        Py_ssize_t s_start, s_stop, s_step, s_len
+        Py_ssize_t ind_start, ind_stop, ind_step, ind_len
+
+    s_start, s_stop, s_step, s_len = slice_get_indices_ex(slc)
+
+    if isinstance(ind, slice):
+        ind_start, ind_stop, ind_step, ind_len = slice_get_indices_ex(ind,
+                                                                      s_len)
+
+        if ind_step > 0 and ind_len == s_len:
+            # short-cut for no-op slice
+            return slc
+
+        if ind_step < 0:
+            s_start = s_stop - s_step
+            ind_step = -ind_step
+
+        s_step *= ind_step
+        s_stop = s_start + ind_stop * s_step
+        s_start = s_start + ind_start * s_step
+
+        if s_step < 0 and s_stop < 0:
+            return slice(s_start, None, s_step)
+        else:
+            return slice(s_start, s_stop, s_step)
+
+    else:
+        return np.arange(s_start, s_stop, s_step)[ind]
+
+
+cdef class BlockPlacement:
+    cdef slice _as_slice
+    cdef object _as_array
+
+    cdef bint _has_slice, _has_array, _is_known_slice_like
+
+    def __init__(self, val):
+        cdef slice slc
+
+        self._has_slice = False
+        self._has_array = False
+
+        if isinstance(val, slice):
+            slc = slice_canonize(val)
+
+            if slc.start != slc.stop:
+                self._as_slice = slc
+                self._has_slice = True
+            else:
+                arr = np.empty(0, dtype=np.int64)
+                self._as_array = arr
+                self._has_array = True
+        else:
+            # Cython memoryview interface requires ndarray to be writeable.
+            arr = np.require(val, dtype=np.int64, requirements='W')
+            assert arr.ndim == 1
+            self._as_array = arr
+            self._has_array = True
+
+    def __unicode__(self):
+        cdef slice s = self._ensure_has_slice()
+        if s is not None:
+            v = self._as_slice
+        else:
+            v = self._as_array
+
+        return '%s(%r)' % (self.__class__.__name__, v)
+
+    def __len__(self):
+        cdef slice s = self._ensure_has_slice()
+        if s is not None:
+            return slice_len(s)
+        else:
+            return len(self._as_array)
+
+    def __iter__(self):
+        cdef slice s = self._ensure_has_slice()
+        cdef Py_ssize_t start, stop, step, _
+        if s is not None:
+            start, stop, step, _ = slice_get_indices_ex(s)
+            return iter(range(start, stop, step))
+        else:
+            return iter(self._as_array)
+
+    @property
+    def as_slice(self):
+        cdef slice s = self._ensure_has_slice()
+        if s is None:
+            raise TypeError('Not slice-like')
+        else:
+            return s
+
+    @property
+    def indexer(self):
+        cdef slice s = self._ensure_has_slice()
+        if s is not None:
+            return s
+        else:
+            return self._as_array
+
+    def isin(self, arr):
+        from pandas.core.index import Int64Index
+        return Int64Index(self.as_array, copy=False).isin(arr)
+
+    @property
+    def as_array(self):
+        cdef Py_ssize_t start, stop, step, _
+        if not self._has_array:
+            start, stop, step, _ = slice_get_indices_ex(self._as_slice)
+            self._as_array = np.arange(start, stop, step,
+                                       dtype=np.int_)
+            self._has_array = True
+        return self._as_array
+
+    @property
+    def is_slice_like(self):
+        cdef slice s = self._ensure_has_slice()
+        return s is not None
+
+    def __getitem__(self, loc):
+        cdef slice s = self._ensure_has_slice()
+        if s is not None:
+            val = slice_getitem(s, loc)
+        else:
+            val = self._as_array[loc]
+
+        if not isinstance(val, slice) and val.ndim == 0:
+            return val
+
+        return BlockPlacement(val)
+
+    def delete(self, loc):
+        return BlockPlacement(np.delete(self.as_array, loc, axis=0))
+
+    def append(self, others):
+        if len(others) == 0:
+            return self
+
+        return
BlockPlacement(np.concatenate([self.as_array] + + [o.as_array for o in others])) + + cdef iadd(self, other): + cdef slice s = self._ensure_has_slice() + cdef Py_ssize_t other_int, start, stop, step, l + + if isinstance(other, int) and s is not None: + other_int = other + + if other_int == 0: + return self + + start, stop, step, l = slice_get_indices_ex(s) + start += other_int + stop += other_int + + if ((step > 0 and start < 0) or + (step < 0 and stop < step)): + raise ValueError("iadd causes length change") + + if stop < 0: + self._as_slice = slice(start, None, step) + else: + self._as_slice = slice(start, stop, step) + + self._has_array = False + self._as_array = None + else: + newarr = self.as_array + other + if (newarr < 0).any(): + raise ValueError("iadd causes length change") + + self._as_array = newarr + self._has_array = True + self._has_slice = False + self._as_slice = None + + return self + + cdef BlockPlacement copy(self): + cdef slice s = self._ensure_has_slice() + if s is not None: + return BlockPlacement(s) + else: + return BlockPlacement(self._as_array) + + def add(self, other): + return self.copy().iadd(other) + + def sub(self, other): + return self.add(-other) + + cdef slice _ensure_has_slice(self): + if not self._has_slice: + self._as_slice = indexer_as_slice(self._as_array) + self._has_slice = True + return self._as_slice + + include "reduce.pyx" include "properties.pyx" include "inference.pyx" diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 1c599653f9fc5..48576266c3b5f 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -541,7 +541,7 @@ def sparse_reindex(self, new_index): raise TypeError('new index must be a SparseIndex') block = self.block.sparse_reindex(new_index) - new_data = SingleBlockManager(block, block.ref_items) + new_data = SingleBlockManager(block, self.index) return self._constructor(new_data, index=self.index, sparse_index=new_index, fill_value=self.fill_value).__finalize__(self) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index 7696353dca6f1..3a2f8adf719e4 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1023,7 +1023,7 @@ def _compare_to_dense(a, b, da, db, op): for op in ops: _compare_to_dense(frame, frame[::2], frame.to_dense(), frame[::2].to_dense(), op) - for s in series: + for i, s in enumerate(series): _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), op) _compare_to_dense(s, frame, s.to_dense(), diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3a3d5a822163f..2aac364d16770 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -106,7 +106,6 @@ class CheckIndexing(object): def test_getitem(self): # slicing - sl = self.frame[:20] self.assertEqual(20, len(sl.index)) @@ -120,7 +119,7 @@ def test_getitem(self): self.assertIsNotNone(self.frame[key]) self.assertNotIn('random', self.frame) - with assertRaisesRegexp(KeyError, 'no item named random'): + with assertRaisesRegexp(KeyError, 'random'): self.frame['random'] df = self.frame.copy() @@ -2723,6 +2722,11 @@ def test_constructor_corner(self): df = DataFrame({}, columns=['foo', 'bar']) self.assertEqual(df.values.dtype, np.object_) + df = DataFrame({'b': 1}, index=lrange(10), columns=list('abc'), + dtype=int) + self.assertEqual(df.values.dtype, np.object_) + + def test_constructor_scalar_inference(self): data = {'int': 1, 'bool': True, 'float': 3., 'complex': 4j, 'object': 'foo'} @@ -3341,7 +3345,6 @@ def 
test_column_dups2(self): assert_frame_equal(result, expected) def test_column_dups_indexing(self): - def check(result, expected=None): if expected is not None: assert_frame_equal(result,expected) @@ -7804,11 +7807,11 @@ def test_regex_replace_dict_mixed(self): # scalar -> dict # to_replace regex, {value: value} + expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': + mix['c']}) res = dfmix.replace('a', {'b': nan}, regex=True) res2 = dfmix.copy() res2.replace('a', {'b': nan}, regex=True, inplace=True) - expec = DataFrame({'a': mix['a'], 'b': [nan, 'b', '.', '.'], 'c': - mix['c']}) assert_frame_equal(res, expec) assert_frame_equal(res2, expec) @@ -8645,7 +8648,6 @@ def test_reindex_dups(self): self.assertRaises(ValueError, df.reindex, index=list(range(len(df)))) def test_align(self): - af, bf = self.frame.align(self.frame) self.assertIsNot(af._data, self.frame._data) @@ -9789,7 +9791,7 @@ def test_reorder_levels(self): assert_frame_equal(result, expected) def test_sort_index(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) # axis=0 @@ -11820,8 +11822,8 @@ def test_columns_with_dups(self): df_dt = DataFrame(Timestamp('20010101'),index=df_float.index,columns=df_float.columns) df = pd.concat([ df_float, df_int, df_bool, df_object, df_dt ], axis=1) - result = df._data._set_ref_locs() - self.assertEqual(len(result), len(df.columns)) + self.assertEqual(len(df._data._blknos), len(df.columns)) + self.assertEqual(len(df._data._blklocs), len(df.columns)) # testing iget for i in range(len(df.columns)): diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 261e1dd2a590c..a105b17795398 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -1015,7 +1015,7 @@ def test_iloc_getitem_doc_issue(self): columns = list(range(0,8,2)) df = DataFrame(arr,index=index,columns=columns) - df._data.blocks[0].ref_locs + df._data.blocks[0].mgr_locs result = df.iloc[1:5,2:4] str(result) result.dtypes diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 2c9c8a94a1902..b91384a840c33 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -4,6 +4,7 @@ import numpy as np from pandas import Index, MultiIndex, DataFrame, Series +from pandas.compat import OrderedDict from pandas.sparse.array import SparseArray from pandas.core.internals import * import pandas.core.internals as internals @@ -17,89 +18,159 @@ def assert_block_equal(left, right): assert_almost_equal(left.values, right.values) assert(left.dtype == right.dtype) - assert(left.items.equals(right.items)) - assert(left.ref_items.equals(right.ref_items)) + assert_almost_equal(left.mgr_locs, right.mgr_locs) -def get_float_mat(n, k, dtype): - return np.repeat(np.atleast_2d(np.arange(k, dtype=dtype)), n, axis=0) - -TEST_COLS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 's1', 's2'] -N = 10 - - -def get_float_ex(cols=['a', 'c', 'e'], dtype = np.float_): - floats = get_float_mat(N, len(cols), dtype = dtype).T - return make_block(floats, cols, TEST_COLS) - - -def get_complex_ex(cols=['h']): - complexes = (get_float_mat(N, 1, dtype = np.float_).T * 1j).astype(np.complex128) - return make_block(complexes, cols, TEST_COLS) - - -def get_obj_ex(cols=['b', 'd']): - mat = np.empty((N, 2), dtype=object) - mat[:, 0] = 'foo' - mat[:, 1] = 'bar' - return make_block(mat.T, cols, TEST_COLS) - -def get_bool_ex(cols=['f']): - mat = np.ones((N, 1), 
dtype=bool)
-    return make_block(mat.T, cols, TEST_COLS)
 
+def get_numeric_mat(shape):
+    arr = np.arange(shape[0])
+    return np.lib.stride_tricks.as_strided(
+        x=arr, shape=shape,
+        strides=(arr.itemsize,) + (0,) * (len(shape) - 1)).copy()
 
-def get_int_ex(cols=['g'], dtype = np.int_):
-    mat = randn(N, 1).astype(dtype)
-    return make_block(mat.T, cols, TEST_COLS)
-
+N = 10
 
-def get_dt_ex(cols=['h']):
-    mat = randn(N, 1).astype(int).astype('M8[ns]')
-    return make_block(mat.T, cols, TEST_COLS)
 
-def get_sparse_ex1():
-    sa1 = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)
-    return make_block(sa1, ['s1'], TEST_COLS)
 
+def create_block(typestr, placement, item_shape=None, num_offset=0):
+    """
+    Supported typestr:
+
+        * float, f8, f4, f2
+        * int, i8, i4, i2, i1
+        * uint, u8, u4, u2, u1
+        * complex, c16, c8
+        * bool
+        * object, string, O
+        * datetime, dt
+        * sparse (SparseArray with fill_value=0.0)
+        * sparse_na (SparseArray with fill_value=np.nan)
+
+    """
+    placement = BlockPlacement(placement)
+    num_items = len(placement)
+
+    if item_shape is None:
+        item_shape = (N,)
+
+    shape = (num_items,) + item_shape
+
+    mat = get_numeric_mat(shape)
+
+    if typestr in ('float', 'f8', 'f4', 'f2',
+                   'int', 'i8', 'i4', 'i2', 'i1',
+                   'uint', 'u8', 'u4', 'u2', 'u1'):
+        values = mat.astype(typestr) + num_offset
+    elif typestr in ('complex', 'c16', 'c8'):
+        values = 1.j * (mat.astype(typestr) + num_offset)
+    elif typestr in ('object', 'string', 'O'):
+        values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
+                            shape)
+    elif typestr == 'bool':
+        values = np.ones(shape, dtype=np.bool_)
+    elif typestr in ('datetime', 'dt'):
+        values = (mat * 1e9).astype('M8[ns]')
+    elif typestr in ('sparse', 'sparse_na'):
+        # FIXME: doesn't support num_rows != 10
+        assert shape[-1] == 10
+        assert all(s == 1 for s in shape[:-1])
+        if typestr.endswith('_na'):
+            fill_value = np.nan
+        else:
+            fill_value = 0.0
+        values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value,
+                              4, 5, fill_value, 6], fill_value=fill_value)
+        arr = values.sp_values.view()
+        arr += (num_offset - 1)
+    else:
+        raise ValueError('Unsupported typestr: "%s"' % typestr)
+
+    return make_block(values, placement=placement, ndim=len(shape))
+
+
+def create_single_mgr(typestr, num_rows=None):
+    if num_rows is None:
+        num_rows = N
+
+    return SingleBlockManager(
+        create_block(typestr, placement=slice(0, num_rows), item_shape=()),
+        np.arange(num_rows))
+
+
+def create_mgr(descr, item_shape=None):
+    """
+    Construct BlockManager from string description.
+
+    String description syntax looks similar to np.matrix initializer.
It looks
+    like this::
+
+        a,b,c: f8; d,e,f: i8
+
+    Rules are rather simple:
+
+    * see the list of supported datatypes in the `create_block` method
+    * components are semicolon-separated
+    * each component is `NAME,NAME,NAME: DTYPE_ID`
+    * whitespace around colons & semicolons is removed
+    * components with the same DTYPE_ID are combined into a single block
+    * to force multiple blocks with the same dtype, use '-SUFFIX'::
+
+        'a:f8-1; b:f8-2; c:f8-foobar'
+
+    """
+    if item_shape is None:
+        item_shape = (N,)
+
+    offset = 0
+    mgr_items = []
+    block_placements = OrderedDict()
+    for d in descr.split(';'):
+        d = d.strip()
+        names, blockstr = d.partition(':')[::2]
+        blockstr = blockstr.strip()
+        names = names.strip().split(',')
+
+        mgr_items.extend(names)
+        placement = list(np.arange(len(names)) + offset)
+        try:
+            block_placements[blockstr].extend(placement)
+        except KeyError:
+            block_placements[blockstr] = placement
+        offset += len(names)
 
-def get_sparse_ex2():
-    sa2 = SparseArray([0, 0, 2, 3, 4, 0, 6, 7, 0, 8], fill_value=0)
-    return make_block(sa2, ['s2'], TEST_COLS)
+    mgr_items = Index(mgr_items)
 
-def create_blockmanager(blocks):
-    l = []
-    for b in blocks:
-        l.extend(b.items)
-    items = Index(l)
-    for b in blocks:
-        b.ref_items = items
+    blocks = []
+    num_offset = 0
+    for blockstr, placement in block_placements.items():
+        typestr = blockstr.split('-')[0]
+        blocks.append(create_block(typestr, placement, item_shape=item_shape,
+                                   num_offset=num_offset))
+        num_offset += len(placement)
 
-    index_sz = blocks[0].shape[1]
-    return BlockManager(blocks, [items, np.arange(index_sz)])
+    return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]),
+                        [mgr_items] + [np.arange(n) for n in item_shape])
 
-def create_singleblockmanager(blocks):
-    l = []
-    for b in blocks:
-        l.extend(b.items)
-    items = Index(l)
-    for b in blocks:
-        b.ref_items = items
-
-    return SingleBlockManager(blocks, [items])
 
 
 class TestBlock(tm.TestCase):
     _multiprocess_can_split_ = True
 
     def setUp(self):
-        self.fblock = get_float_ex()
-        self.cblock = get_complex_ex()
-        self.oblock = get_obj_ex()
-        self.bool_block = get_bool_ex()
-        self.int_block = get_int_ex()
+        self.fblock = create_block('float', [0, 2, 4])
+        self.cblock = create_block('complex', [7])
+        self.oblock = create_block('object', [1, 3])
+        self.bool_block = create_block('bool', [5])
+        self.int_block = create_block('int', [6])
 
     def test_constructor(self):
-        int32block = get_int_ex(['a'],dtype = np.int32)
+        int32block = create_block('i4', [0])
         self.assertEqual(int32block.dtype, np.int32)
 
     def test_pickle(self):
@@ -115,8 +186,8 @@ def _check(blk):
         _check(self.oblock)
         _check(self.bool_block)
 
-    def test_ref_locs(self):
-        assert_almost_equal(self.fblock.ref_locs, [0, 2, 4])
+    def test_mgr_locs(self):
+        assert_almost_equal(self.fblock.mgr_locs, [0, 2, 4])
 
     def test_attrs(self):
         self.assertEqual(self.fblock.shape, self.fblock.values.shape)
@@ -127,16 +198,16 @@ def test_merge(self):
         avals = randn(2, 10)
         bvals = randn(2, 10)
 
-        ref_cols = ['e', 'a', 'b', 'd', 'f']
+        ref_cols = Index(['e', 'a', 'b', 'd', 'f'])
 
-        ablock = make_block(avals, ['e', 'b'], ref_cols)
-        bblock = make_block(bvals, ['a', 'd'], ref_cols)
+        ablock = make_block(avals,
+                            ref_cols.get_indexer(['e', 'b']))
+        bblock = make_block(bvals,
+                            ref_cols.get_indexer(['a', 'd']))
         merged = ablock.merge(bblock)
-        exvals = np.vstack((avals, bvals))
-        excols = ['e', 'b', 'a',
'd'] - eblock = make_block(exvals, excols, ref_cols) - eblock = eblock.reindex_items_from(ref_cols) - assert_block_equal(merged, eblock) + assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3]) + assert_almost_equal(merged.values[[0, 2]], avals) + assert_almost_equal(merged.values[[1, 3]], bvals) # TODO: merge with mixed type? @@ -145,29 +216,9 @@ def test_copy(self): self.assertIsNot(cop, self.fblock) assert_block_equal(self.fblock, cop) - def test_items(self): - cols = self.fblock.items - self.assert_numpy_array_equal(cols, ['a', 'c', 'e']) - - cols2 = self.fblock.items - self.assertIs(cols, cols2) - - def test_assign_ref_items(self): - new_cols = Index(['foo', 'bar', 'baz', 'quux', 'hi']) - self.fblock.set_ref_items(new_cols) - self.assert_numpy_array_equal(self.fblock.items, ['foo', 'baz', 'hi']) - def test_reindex_index(self): pass - def test_reindex_items_from(self): - new_cols = Index(['e', 'b', 'c', 'f']) - reindexed = self.fblock.reindex_items_from(new_cols) - assert_almost_equal(reindexed.ref_locs, [0, 2]) - self.assertEquals(reindexed.values.shape[0], 2) - self.assert_((reindexed.values[0] == 2).all()) - self.assert_((reindexed.values[1] == 1).all()) - def test_reindex_cast(self): pass @@ -175,19 +226,23 @@ def test_insert(self): pass def test_delete(self): - newb = self.fblock.delete('a') - assert_almost_equal(newb.ref_locs, [2, 4]) + newb = self.fblock.copy() + newb.delete(0) + assert_almost_equal(newb.mgr_locs, [2, 4]) self.assert_((newb.values[0] == 1).all()) - newb = self.fblock.delete('c') - assert_almost_equal(newb.ref_locs, [0, 4]) + newb = self.fblock.copy() + newb.delete(1) + assert_almost_equal(newb.mgr_locs, [0, 4]) self.assert_((newb.values[1] == 2).all()) - newb = self.fblock.delete('e') - assert_almost_equal(newb.ref_locs, [0, 2]) + newb = self.fblock.copy() + newb.delete(2) + assert_almost_equal(newb.mgr_locs, [0, 2]) self.assert_((newb.values[1] == 1).all()) - self.assertRaises(Exception, self.fblock.delete, 'b') + newb = self.fblock.copy() + self.assertRaises(Exception, newb.delete, 3) def test_split_block_at(self): @@ -212,13 +267,6 @@ def test_split_block_at(self): bs = list(bblock.split_block_at('f')) self.assertEqual(len(bs), 0) - def test_unicode_repr(self): - mat = np.empty((N, 2), dtype=object) - mat[:, 0] = 'foo' - mat[:, 1] = 'bar' - cols = ['b', u("\u05d0")] - str_repr = repr(make_block(mat.T, cols, TEST_COLS)) - def test_get(self): pass @@ -233,76 +281,52 @@ def test_repr(self): class TestBlockManager(tm.TestCase): - _multiprocess_can_split_ = True def setUp(self): - self.blocks = [get_float_ex(), - get_obj_ex(), - get_bool_ex(), - get_int_ex(), - get_complex_ex()] - - all_items = [b.items for b in self.blocks] - - items = sorted(all_items[0].append(all_items[1:])) - items = Index(items) - for b in self.blocks: - b.ref_items = items - - self.mgr = BlockManager(self.blocks, [items, np.arange(N)]) + self.mgr = create_mgr('a: f8; b: object; c: f8; d: object; e: f8;' + 'f: bool; g: i8; h: complex') def test_constructor_corner(self): pass def test_attrs(self): - self.assertEquals(self.mgr.nblocks, len(self.mgr.blocks)) - self.assertEquals(len(self.mgr), len(self.mgr.items)) + mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2') + self.assertEquals(mgr.nblocks, 2) + self.assertEquals(len(mgr), 6) def test_is_mixed_dtype(self): - self.assertTrue(self.mgr.is_mixed_type) + self.assertFalse(create_mgr('a,b:f8').is_mixed_type) + self.assertFalse(create_mgr('a:f8-1; b:f8-2').is_mixed_type) - mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])]) - 
self.assertFalse(mgr.is_mixed_type) + self.assertTrue(create_mgr('a,b:f8; c,d: f4').is_mixed_type) + self.assertTrue(create_mgr('a,b:f8; c,d: object').is_mixed_type) def test_is_indexed_like(self): - self.assertTrue(self.mgr._is_indexed_like(self.mgr)) - mgr2 = self.mgr.reindex_axis(np.arange(N - 1), axis=1) - self.assertFalse(self.mgr._is_indexed_like(mgr2)) - - def test_block_id_vector_item_dtypes(self): - expected = [0, 1, 0, 1, 0, 2, 3, 4] - result = self.mgr.block_id_vector - assert_almost_equal(expected, result) - - result = self.mgr.item_dtypes - - # as the platform may not exactly match this, pseudo match - expected = ['float64', 'object', 'float64', 'object', 'float64', - 'bool', 'int64', 'complex128'] - for e, r in zip(expected, result): - np.dtype(e).kind == np.dtype(r).kind - - def test_duplicate_item_failure(self): - items = Index(['a', 'a']) - blocks = [get_bool_ex(['a']), get_float_ex(['a'])] - for b in blocks: - b.ref_items = items - - # test trying to create _ref_locs with/o ref_locs set on the blocks - self.assertRaises(AssertionError, BlockManager, blocks, [items, np.arange(N)]) - - blocks[0].set_ref_locs([0]) - blocks[1].set_ref_locs([1]) - mgr = BlockManager(blocks, [items, np.arange(N)]) - mgr.iget(1) + mgr1 = create_mgr('a,b: f8') + mgr2 = create_mgr('a:i8; b:bool') + mgr3 = create_mgr('a,b,c: f8') + self.assertTrue(mgr1._is_indexed_like(mgr1)) + self.assertTrue(mgr1._is_indexed_like(mgr2)) + self.assertTrue(mgr1._is_indexed_like(mgr3)) + + self.assertFalse(mgr1._is_indexed_like( + mgr1.get_slice(slice(-1), axis=1))) + + def test_duplicate_ref_loc_failure(self): + tmp_mgr = create_mgr('a:bool; a: f8') + + axes, blocks = tmp_mgr.axes, tmp_mgr.blocks - # invalidate the _ref_locs - for b in blocks: - b._ref_locs = None - mgr._ref_locs = None - mgr._items_map = None - self.assertRaises(AssertionError, mgr._set_ref_locs, do_refs=True) + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([0]) + # test trying to create block manager with overlapping ref locs + self.assertRaises(AssertionError, BlockManager, blocks, axes) + + blocks[0].mgr_locs = np.array([0]) + blocks[1].mgr_locs = np.array([1]) + mgr = BlockManager(blocks, axes) + mgr.iget(1) def test_contains(self): self.assertIn('a', self.mgr) @@ -318,7 +342,7 @@ def test_pickle(self): assert_frame_equal(DataFrame(self.mgr), DataFrame(mgr2)) # share ref_items - self.assertIs(mgr2.blocks[0].ref_items, mgr2.blocks[1].ref_items) + # self.assertIs(mgr2.blocks[0].ref_items, mgr2.blocks[1].ref_items) # GH2431 self.assertTrue(hasattr(mgr2, "_is_consolidated")) @@ -328,9 +352,6 @@ def test_pickle(self): self.assertFalse(mgr2._is_consolidated) self.assertFalse(mgr2._known_consolidated) - def test_get(self): - pass - def test_get_scalar(self): for item in self.mgr.items: for i, index in enumerate(self.mgr.axes[1]): @@ -338,8 +359,35 @@ def test_get_scalar(self): exp = self.mgr.get(item)[i] assert_almost_equal(res, exp) + def test_get(self): + cols = Index(list('abc')) + values = np.random.rand(3, 3) + block = make_block(values=values.copy(), + placement=np.arange(3)) + mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)]) + + assert_almost_equal(mgr.get('a'), values[0]) + assert_almost_equal(mgr.get('b'), values[1]) + assert_almost_equal(mgr.get('c'), values[2]) + def test_set(self): - pass + mgr = create_mgr('a,b,c: int', item_shape=(3,)) + + mgr.set('d', np.array(['foo'] * 3)) + mgr.set('b', np.array(['bar'] * 3)) + assert_almost_equal(mgr.get('a'), [0] * 3) + assert_almost_equal(mgr.get('b'), ['bar'] * 
3)
+        assert_almost_equal(mgr.get('c'), [2] * 3)
+        assert_almost_equal(mgr.get('d'), ['foo'] * 3)
+
+    def test_insert(self):
+        self.mgr.insert(0, 'inserted', np.arange(N))
+
+        self.assertEqual(self.mgr.items[0], 'inserted')
+        assert_almost_equal(self.mgr.get('inserted'), np.arange(N))
+
+        # all block placements must still be within bounds
+        for blk in self.mgr.blocks:
+            self.assertTrue((blk.mgr_locs.as_array <
+                             len(self.mgr.items)).all())
 
     def test_set_change_dtype(self):
         self.mgr.set('baz', np.zeros(N, dtype=bool))
@@ -370,58 +418,68 @@ def test_copy(self):
         self.assertTrue(found)
 
     def test_sparse(self):
-        mgr = create_blockmanager([get_sparse_ex1(),get_sparse_ex2()])
+        mgr = create_mgr('a: sparse-1; b: sparse-2')
 
         # what to test here?
         self.assertEqual(mgr.as_matrix().dtype, np.float64)
 
     def test_sparse_mixed(self):
-        mgr = create_blockmanager([get_sparse_ex1(),get_sparse_ex2(),get_float_ex()])
+        mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8')
        self.assertEqual(len(mgr.blocks), 3)
         self.assertIsInstance(mgr, BlockManager)
 
         # what to test here?
 
     def test_as_matrix_float(self):
-
-        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)])
+        mgr = create_mgr('c: f4; d: f2; e: f8')
         self.assertEqual(mgr.as_matrix().dtype, np.float64)
 
-        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16)])
+        mgr = create_mgr('c: f4; d: f2')
         self.assertEqual(mgr.as_matrix().dtype, np.float32)
 
     def test_as_matrix_int_bool(self):
-
-        mgr = create_blockmanager([get_bool_ex(['a']), get_bool_ex(['b'])])
+        mgr = create_mgr('a: bool-1; b: bool-2')
         self.assertEqual(mgr.as_matrix().dtype, np.bool_)
 
-        mgr = create_blockmanager([get_int_ex(['a'],np.int64), get_int_ex(['b'],np.int64), get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ])
+        mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1')
         self.assertEqual(mgr.as_matrix().dtype, np.int64)
 
-        mgr = create_blockmanager([get_int_ex(['c'],np.int32), get_int_ex(['d'],np.int16), get_int_ex(['e'],np.uint8) ])
+        mgr = create_mgr('c: i4; d: i2; e: u1')
         self.assertEqual(mgr.as_matrix().dtype, np.int32)
 
     def test_as_matrix_datetime(self):
-        mgr = create_blockmanager([get_dt_ex(['h']), get_dt_ex(['g'])])
+        mgr = create_mgr('h: datetime-1; g: datetime-2')
         self.assertEqual(mgr.as_matrix().dtype, 'M8[ns]')
 
     def test_astype(self):
-        # coerce all
-        mgr = create_blockmanager([get_float_ex(['c'],np.float32), get_float_ex(['d'],np.float16), get_float_ex(['e'],np.float64)])
-
-        for t in ['float16','float32','float64','int32','int64']:
+        mgr = create_mgr('c: f4; d: f2; e: f8')
+        for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
+            t = np.dtype(t)
             tmgr = mgr.astype(t)
-            self.assertEqual(tmgr.as_matrix().dtype, np.dtype(t))
+            self.assertEqual(tmgr.get('c').dtype.type, t)
+            self.assertEqual(tmgr.get('d').dtype.type, t)
+            self.assertEqual(tmgr.get('e').dtype.type, t)
 
         # mixed
-        mgr = create_blockmanager([get_obj_ex(['a','b']),get_bool_ex(['c']),get_dt_ex(['d']),get_float_ex(['e'],np.float32), get_float_ex(['f'],np.float16), get_float_ex(['g'],np.float64)])
-        for t in ['float16','float32','float64','int32','int64']:
-            tmgr = mgr.astype(t, raise_on_error = False).get_numeric_data()
-            self.assertEqual(tmgr.as_matrix().dtype, np.dtype(t))
+        mgr = create_mgr('a,b: object; c: bool; d: datetime;'
+                         'e: f4; f: f2; g: f8')
+        for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
+            t = np.dtype(t)
+            tmgr = mgr.astype(t, raise_on_error=False)
+            self.assertEqual(tmgr.get('c').dtype.type, t)
+            self.assertEqual(tmgr.get('e').dtype.type,
t) + self.assertEqual(tmgr.get('f').dtype.type, t) + self.assertEqual(tmgr.get('g').dtype.type, t) + + self.assertEqual(tmgr.get('a').dtype.type, np.object_) + self.assertEqual(tmgr.get('b').dtype.type, np.object_) + if t != np.int64: + self.assertEqual(tmgr.get('d').dtype.type, np.datetime64) + else: + self.assertEqual(tmgr.get('d').dtype.type, t) def test_convert(self): - def _compare(old_mgr, new_mgr): """ compare the blocks, numeric compare ==, object don't """ old_blocks = set(old_mgr.blocks) @@ -446,45 +504,41 @@ def _compare(old_mgr, new_mgr): self.assertTrue(found) # noops - mgr = create_blockmanager([get_int_ex(['f']), get_float_ex(['g'])]) + mgr = create_mgr('f: i8; g: f8') new_mgr = mgr.convert() _compare(mgr,new_mgr) - mgr = create_blockmanager([get_obj_ex(['a','b']), get_int_ex(['f']), get_float_ex(['g'])]) + mgr = create_mgr('a, b: object; f: i8; g: f8') new_mgr = mgr.convert() _compare(mgr,new_mgr) - # there could atcually be multiple dtypes resulting - def _check(new_mgr,block_type, citems): - items = set() - for b in new_mgr.blocks: - if isinstance(b,block_type): - for i in list(b.items): - items.add(i) - self.assertEqual(items, set(citems)) - # convert - mat = np.empty((N, 3), dtype=object) - mat[:, 0] = '1' - mat[:, 1] = '2.' - mat[:, 2] = 'foo' - b = make_block(mat.T, ['a','b','foo'], TEST_COLS) - - mgr = create_blockmanager([b, get_int_ex(['f']), get_float_ex(['g'])]) - new_mgr = mgr.convert(convert_numeric = True) - - _check(new_mgr,FloatBlock,['b','g']) - _check(new_mgr,IntBlock,['a','f']) - - mgr = create_blockmanager([b, get_int_ex(['f'],np.int32), get_bool_ex(['bool']), get_dt_ex(['dt']), - get_int_ex(['i'],np.int64), get_float_ex(['g'],np.float64), get_float_ex(['h'],np.float16)]) - new_mgr = mgr.convert(convert_numeric = True) - - _check(new_mgr,FloatBlock,['b','g','h']) - _check(new_mgr,IntBlock,['a','f','i']) - _check(new_mgr,ObjectBlock,['foo']) - _check(new_mgr,BoolBlock,['bool']) - _check(new_mgr,DatetimeBlock,['dt']) + mgr = create_mgr('a,b,foo: object; f: i8; g: f8') + mgr.set('a', np.array(['1'] * N, dtype=np.object_)) + mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) + mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + new_mgr = mgr.convert(convert_numeric=True) + self.assertEquals(new_mgr.get('a').dtype.type, np.int64) + self.assertEquals(new_mgr.get('b').dtype.type, np.float64) + self.assertEquals(new_mgr.get('foo').dtype.type, np.object_) + self.assertEquals(new_mgr.get('f').dtype.type, np.int64) + self.assertEquals(new_mgr.get('g').dtype.type, np.float64) + + mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;' + 'i: i8; g: f8; h: f2') + mgr.set('a', np.array(['1'] * N, dtype=np.object_)) + mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) + mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) + new_mgr = mgr.convert(convert_numeric=True) + self.assertEquals(new_mgr.get('a').dtype.type, np.int64) + self.assertEquals(new_mgr.get('b').dtype.type, np.float64) + self.assertEquals(new_mgr.get('foo').dtype.type, np.object_) + self.assertEquals(new_mgr.get('f').dtype.type, np.int32) + self.assertEquals(new_mgr.get('bool').dtype.type, np.bool_) + self.assertEquals(new_mgr.get('dt').dtype.type, np.datetime64) + self.assertEquals(new_mgr.get('i').dtype.type, np.int64) + self.assertEquals(new_mgr.get('g').dtype.type, np.float64) + self.assertEquals(new_mgr.get('h').dtype.type, np.float16) def test_interleave(self): pass @@ -512,69 +566,79 @@ def test_consolidate_ordering_issues(self): cons = self.mgr.consolidate() 
self.assertEquals(cons.nblocks, 1) - self.assertTrue(cons.blocks[0].items.equals(cons.items)) + assert_almost_equal(cons.blocks[0].mgr_locs, + np.arange(len(cons.items))) def test_reindex_index(self): pass def test_reindex_items(self): - def _check_cols(before, after, cols): - for col in cols: - assert_almost_equal(after.get(col), before.get(col)) - - # not consolidated - vals = randn(N) - self.mgr.set('g', vals) - reindexed = self.mgr.reindex_items(['g', 'c', 'a', 'd']) + # mgr is not consolidated, f8 & f8-2 blocks + mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;' + 'f: bool; g: f8-2') + + reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0) self.assertEquals(reindexed.nblocks, 2) - assert_almost_equal(reindexed.get('g'), vals.squeeze()) - _check_cols(self.mgr, reindexed, ['c', 'a', 'd']) + assert_almost_equal(reindexed.items, ['g', 'c', 'a', 'd']) + assert_almost_equal(mgr.get('g'), reindexed.get('g')) + assert_almost_equal(mgr.get('c'), reindexed.get('c')) + assert_almost_equal(mgr.get('a'), reindexed.get('a')) + assert_almost_equal(mgr.get('d'), reindexed.get('d')) + + def test_multiindex_xs(self): + mgr = create_mgr('a,b,c: f8; d,e,f: i8') - def test_xs(self): index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) - self.mgr.set_axis(1, index) + mgr.set_axis(1, index) + result = mgr.xs('bar', axis=1) + self.assertEqual(result.shape, (6, 2)) + self.assertEqual(result.axes[1][0], ('bar', 'one')) + self.assertEqual(result.axes[1][1], ('bar', 'two')) - result = self.mgr.xs('bar', axis=1) - expected = self.mgr.get_slice(slice(3, 5), axis=1) + def test_get_numeric_data(self): + mgr = create_mgr('int: int; float: float; complex: complex;' + 'str: object; bool: bool; obj: object; dt: datetime', + item_shape=(3,)) + mgr.set('obj', np.array([1, 2, 3], dtype=np.object_)) - assert_frame_equal(DataFrame(result), DataFrame(expected)) + numeric = mgr.get_numeric_data() + assert_almost_equal(numeric.items, ['int', 'float', 'complex', 'bool']) + assert_almost_equal(mgr.get('float'), numeric.get('float')) - def test_get_numeric_data(self): - int_ser = Series(np.array([0, 1, 2])) - float_ser = Series(np.array([0., 1., 2.])) - complex_ser = Series(np.array([0j, 1j, 2j])) - str_ser = Series(np.array(['a', 'b', 'c'])) - bool_ser = Series(np.array([True, False, True])) - obj_ser = Series(np.array([1, 'a', 5])) - dt_ser = Series(tm.makeDateIndex(3)) - # check types - df = DataFrame({'int': int_ser, 'float': float_ser, - 'complex': complex_ser, 'str': str_ser, - 'bool': bool_ser, 'obj': obj_ser, - 'dt': dt_ser}) - xp = DataFrame({'int': int_ser, 'float': float_ser, - 'complex': complex_ser, 'bool': bool_ser}) - rs = DataFrame(df._data.get_numeric_data()) - assert_frame_equal(xp, rs) - - xp = DataFrame({'bool': bool_ser}) - rs = DataFrame(df._data.get_bool_data()) - assert_frame_equal(xp, rs) - - rs = DataFrame(df._data.get_bool_data()) - df.ix[0, 'bool'] = not df.ix[0, 'bool'] - - self.assertEqual(rs.ix[0, 'bool'], df.ix[0, 'bool']) - - rs = DataFrame(df._data.get_bool_data(copy=True)) - df.ix[0, 'bool'] = not df.ix[0, 'bool'] - - self.assertEqual(rs.ix[0, 'bool'], not df.ix[0, 'bool']) + # Check sharing + numeric.set('float', np.array([100., 200., 300.])) + assert_almost_equal(mgr.get('float'), np.array([100., 200., 300.])) + + numeric2 = mgr.get_numeric_data(copy=True) + assert_almost_equal(numeric.items, ['int', 'float', 'complex', 'bool']) + numeric2.set('float', 
np.array([1000., 2000., 3000.])) + assert_almost_equal(mgr.get('float'), np.array([100., 200., 300.])) + + def test_get_bool_data(self): + mgr = create_mgr('int: int; float: float; complex: complex;' + 'str: object; bool: bool; obj: object; dt: datetime', + item_shape=(3,)) + mgr.set('obj', np.array([True, False, True], dtype=np.object_)) + + bools = mgr.get_bool_data() + assert_almost_equal(bools.items, ['bool']) + assert_almost_equal(mgr.get('bool'), bools.get('bool')) + + bools.set('bool', np.array([True, False, True])) + assert_almost_equal(mgr.get('bool'), [True, False, True]) + + # Check sharing + bools2 = mgr.get_bool_data(copy=True) + bools2.set('bool', np.array([False, True, False])) + assert_almost_equal(mgr.get('bool'), [True, False, True]) + + def test_unicode_repr_doesnt_raise(self): + str_repr = repr(create_mgr(u('b,\u05d0: object'))) def test_missing_unicode_key(self): df = DataFrame({"a": [1]}) @@ -585,26 +649,342 @@ def test_missing_unicode_key(self): def test_equals(self): # unique items - index = Index(list('abcdef')) - block1 = make_block(np.arange(12).reshape(3,4), list('abc'), index) - block2 = make_block(np.arange(12).reshape(3,4)*10, list('def'), index) - block1.ref_items = block2.ref_items = index - bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) - bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) + bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2') + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) - # non-unique items - index = Index(list('aaabbb')) - block1 = make_block(np.arange(12).reshape(3,4), list('aaa'), index, - placement=[0,1,2]) - block2 = make_block(np.arange(12).reshape(3,4)*10, list('bbb'), index, - placement=[3,4,5]) - block1.ref_items = block2.ref_items = index - bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) - bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) + bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2') + bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) + def test_single_mgr_ctor(self): + mgr = create_single_mgr('f8', num_rows=5) + self.assertEquals(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.]) + + +class TestIndexing(object): + # Nosetests-style data-driven tests. + # + # This test applies different indexing routines to block managers and + # compares the outcome to the result of same operations on np.ndarray. + # + # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests + # and are disabled. 
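+    #
+    # The create_mgr() shorthand used for the fixtures below maps
+    # "names: dtype" pairs to blocks: create_mgr('a,b: f8; c: i8',
+    # item_shape=(N,)) yields a 2-dim manager holding a 2 x N float64
+    # block and a 1 x N int64 block, and a numeric suffix as in
+    # 'a: i8-1; b: i8-2' keeps same-dtype items in separate blocks.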
+
+    MANAGERS = [
+        create_single_mgr('f8', N),
+        create_single_mgr('i8', N),
+        #create_single_mgr('sparse', N),
+        create_single_mgr('sparse_na', N),
+
+        # 2-dim
+        create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)),
+        create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)),
+        create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)),
+        create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)),
+        #create_mgr('a: sparse', item_shape=(N,)),
+        create_mgr('a: sparse_na', item_shape=(N,)),
+
+        # 3-dim
+        create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)),
+        create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)),
+        create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)),
+        create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)),
+        # create_mgr('a: sparse', item_shape=(1, N)),
+    ]
+
+    def test_get_slice(self):
+        def assert_slice_ok(mgr, axis, slobj):
+            mat = mgr.as_matrix()
+            sliced = mgr.get_slice(slobj, axis=axis)
+            mat_slobj = (slice(None),) * axis + (slobj,)
+            assert_almost_equal(mat[mat_slobj], sliced.as_matrix())
+            assert_almost_equal(mgr.axes[axis][slobj], sliced.axes[axis])
+
+        for mgr in self.MANAGERS:
+            for ax in range(mgr.ndim):
+                # slice
+                yield assert_slice_ok, mgr, ax, slice(None)
+                yield assert_slice_ok, mgr, ax, slice(3)
+                yield assert_slice_ok, mgr, ax, slice(100)
+                yield assert_slice_ok, mgr, ax, slice(1, 4)
+                yield assert_slice_ok, mgr, ax, slice(3, 0, -2)
+
+                # boolean mask
+                yield assert_slice_ok, mgr, ax, np.array([], dtype=np.bool_)
+                yield (assert_slice_ok, mgr, ax,
+                       np.ones(mgr.shape[ax], dtype=np.bool_))
+                yield (assert_slice_ok, mgr, ax,
+                       np.zeros(mgr.shape[ax], dtype=np.bool_))
+
+                if mgr.shape[ax] >= 3:
+                    yield (assert_slice_ok, mgr, ax,
+                           np.arange(mgr.shape[ax]) % 3 == 0)
+                    yield (assert_slice_ok, mgr, ax,
+                           np.array([True, True, False], dtype=np.bool_))
+
+                # fancy indexer
+                yield assert_slice_ok, mgr, ax, []
+                yield assert_slice_ok, mgr, ax, lrange(mgr.shape[ax])
+
+                if mgr.shape[ax] >= 3:
+                    yield assert_slice_ok, mgr, ax, [0, 1, 2]
+                    yield assert_slice_ok, mgr, ax, [-1, -2, -3]
+
+    def test_take(self):
+        def assert_take_ok(mgr, axis, indexer):
+            mat = mgr.as_matrix()
+            taken = mgr.take(indexer, axis)
+            assert_almost_equal(np.take(mat, indexer, axis),
+                                taken.as_matrix())
+            assert_almost_equal(mgr.axes[axis].take(indexer),
+                                taken.axes[axis])
+
+        for mgr in self.MANAGERS:
+            for ax in range(mgr.ndim):
+                # take/fancy indexer
+                yield assert_take_ok, mgr, ax, []
+                yield assert_take_ok, mgr, ax, [0, 0, 0]
+                yield assert_take_ok, mgr, ax, lrange(mgr.shape[ax])
+
+                if mgr.shape[ax] >= 3:
+                    yield assert_take_ok, mgr, ax, [0, 1, 2]
+                    yield assert_take_ok, mgr, ax, [-1, -2, -3]
+
+    def test_reindex_axis(self):
+        def assert_reindex_axis_is_ok(mgr, axis, new_labels,
+                                      fill_value):
+            mat = mgr.as_matrix()
+            indexer = mgr.axes[axis].get_indexer_for(new_labels)
+
+            reindexed = mgr.reindex_axis(new_labels, axis,
+                                         fill_value=fill_value)
+            assert_almost_equal(com.take_nd(mat, indexer, axis,
+                                            fill_value=fill_value),
+                                reindexed.as_matrix())
+            assert_almost_equal(reindexed.axes[axis], new_labels)
+
+        for mgr in self.MANAGERS:
+            for ax in range(mgr.ndim):
+                for fill_value in (None, np.nan, 100.):
+                    yield assert_reindex_axis_is_ok, mgr, ax, [], fill_value
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           mgr.axes[ax], fill_value)
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           mgr.axes[ax][[0, 0, 0]], fill_value)
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           ['foo', 'bar', 'baz'], fill_value)
+                    yield (assert_reindex_axis_is_ok, mgr, ax,
+                           ['foo', mgr.axes[ax][0], 'baz'], fill_value)
+
+                    if mgr.shape[ax] >= 3:
+                        yield (assert_reindex_axis_is_ok, mgr, ax,
+                               mgr.axes[ax][:-3], fill_value)
+                        yield (assert_reindex_axis_is_ok, mgr, ax,
+                               mgr.axes[ax][-3::-1], fill_value)
+                        yield (assert_reindex_axis_is_ok, mgr, ax,
+                               mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value)
+
+    def test_reindex_indexer(self):
+        def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer,
+                                         fill_value):
+            mat = mgr.as_matrix()
+            reindexed_mat = com.take_nd(mat, indexer, axis,
+                                        fill_value=fill_value)
+            reindexed = mgr.reindex_indexer(new_labels, indexer, axis,
+                                            fill_value=fill_value)
+            assert_almost_equal(reindexed_mat, reindexed.as_matrix())
+            assert_almost_equal(reindexed.axes[axis], new_labels)
+
+        for mgr in self.MANAGERS:
+            for ax in range(mgr.ndim):
+                for fill_value in (None, np.nan, 100.):
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           [], [], fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo'] * mgr.shape[ax], np.arange(mgr.shape[ax]),
+                           fill_value)
+
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           mgr.axes[ax][::-1], np.arange(mgr.shape[ax]),
+                           fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           mgr.axes[ax], np.arange(mgr.shape[ax])[::-1],
+                           fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo', 'bar', 'baz'], [0, 0, 0], fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo', 'bar', 'baz'], [-1, 0, -1], fill_value)
+                    yield (assert_reindex_indexer_is_ok, mgr, ax,
+                           ['foo', mgr.axes[ax][0], 'baz'], [-1, -1, -1],
+                           fill_value)
+
+                    if mgr.shape[ax] >= 3:
+                        yield (assert_reindex_indexer_is_ok, mgr, ax,
+                               ['foo', 'bar', 'baz'], [0, 1, 2], fill_value)
+
+
+class TestBlockPlacement(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_slice_len(self):
+        self.assertEquals(len(BlockPlacement(slice(0, 4))), 4)
+        self.assertEquals(len(BlockPlacement(slice(0, 4, 2))), 2)
+        self.assertEquals(len(BlockPlacement(slice(0, 3, 2))), 2)
+
+        self.assertEquals(len(BlockPlacement(slice(0, 1, 2))), 1)
+        self.assertEquals(len(BlockPlacement(slice(1, 0, -1))), 1)
+
+    def test_zero_step_raises(self):
+        self.assertRaises(ValueError, BlockPlacement, slice(1, 1, 0))
+        self.assertRaises(ValueError, BlockPlacement, slice(1, 2, 0))
+
+    def test_unbounded_slice_raises(self):
+        def assert_unbounded_slice_error(slc):
+            # assertRaisesRegexp is not available in py2.6
+            # self.assertRaisesRegexp(ValueError, "unbounded slice",
+            #                         lambda: BlockPlacement(slc))
+            self.assertRaises(ValueError, BlockPlacement, slc)
+
+        assert_unbounded_slice_error(slice(None, None))
+        assert_unbounded_slice_error(slice(10, None))
+        assert_unbounded_slice_error(slice(None, None, -1))
+        assert_unbounded_slice_error(slice(None, 10, -1))
+
+        # These are "unbounded" because negative index will change depending on
+        # container shape.
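+        # For example, slice(-1, None) resolves to position 4 in a
+        # container of length 5 but to position 2 in one of length 3,
+        # so there is no shape-independent set of locations to store.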
+ assert_unbounded_slice_error(slice(-1, None)) + assert_unbounded_slice_error(slice(None, -1)) + assert_unbounded_slice_error(slice(-1, -1)) + assert_unbounded_slice_error(slice(-1, None, -1)) + assert_unbounded_slice_error(slice(None, -1, -1)) + assert_unbounded_slice_error(slice(-1, -1, -1)) + + def test_not_slice_like_slices(self): + def assert_not_slice_like(slc): + self.assertTrue(not BlockPlacement(slc).is_slice_like) + + assert_not_slice_like(slice(0, 0)) + assert_not_slice_like(slice(100, 0)) + + assert_not_slice_like(slice(100, 100, -1)) + assert_not_slice_like(slice(0, 100, -1)) + + self.assertTrue(not BlockPlacement(slice(0, 0)).is_slice_like) + self.assertTrue(not BlockPlacement(slice(100, 100)).is_slice_like) + + def test_array_to_slice_conversion(self): + def assert_as_slice_equals(arr, slc): + self.assertEquals(BlockPlacement(arr).as_slice, slc) + + assert_as_slice_equals([0], slice(0, 1, 1)) + assert_as_slice_equals([100], slice(100, 101, 1)) + + assert_as_slice_equals([0, 1, 2], slice(0, 3, 1)) + assert_as_slice_equals([0, 5, 10], slice(0, 15, 5)) + assert_as_slice_equals([0, 100], slice(0, 200, 100)) + + assert_as_slice_equals([2, 1], slice(2, 0, -1)) + assert_as_slice_equals([2, 1, 0], slice(2, None, -1)) + assert_as_slice_equals([100, 0], slice(100, None, -100)) + + def test_not_slice_like_arrays(self): + def assert_not_slice_like(arr): + self.assertTrue(not BlockPlacement(arr).is_slice_like) + + assert_not_slice_like([]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, -2, -3]) + assert_not_slice_like([-10]) + assert_not_slice_like([-1]) + assert_not_slice_like([-1, 0, 1, 2]) + assert_not_slice_like([-2, 0, 2, 4]) + assert_not_slice_like([1, 0, -1]) + assert_not_slice_like([1, 1, 1]) + + def test_slice_iter(self): + self.assertEquals(list(BlockPlacement(slice(0, 3))), [0, 1, 2]) + self.assertEquals(list(BlockPlacement(slice(0, 0))), []) + self.assertEquals(list(BlockPlacement(slice(3, 0))), []) + + self.assertEquals(list(BlockPlacement(slice(3, 0, -1))), [3, 2, 1]) + self.assertEquals(list(BlockPlacement(slice(3, None, -1))), + [3, 2, 1, 0]) + + def test_slice_to_array_conversion(self): + def assert_as_array_equals(slc, asarray): + np.testing.assert_array_equal( + BlockPlacement(slc).as_array, + np.asarray(asarray)) + + assert_as_array_equals(slice(0, 3), [0, 1, 2]) + assert_as_array_equals(slice(0, 0), []) + assert_as_array_equals(slice(3, 0), []) + + assert_as_array_equals(slice(3, 0, -1), [3, 2, 1]) + assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0]) + assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1]) + + def test_blockplacement_add(self): + bpl = BlockPlacement(slice(0, 5)) + self.assertEquals(bpl.add(1).as_slice, slice(1, 6, 1)) + self.assertEquals(bpl.add(np.arange(5)).as_slice, + slice(0, 10, 2)) + self.assertEquals(list(bpl.add(np.arange(5, 0, -1))), + [5, 5, 5, 5, 5]) + + def test_blockplacement_add_int(self): + def assert_add_equals(val, inc, result): + self.assertEquals(list(BlockPlacement(val).add(inc)), + result) + + assert_add_equals(slice(0, 0), 0, []) + assert_add_equals(slice(1, 4), 0, [1, 2, 3]) + assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1]) + assert_add_equals(slice(2, None, -1), 0, [2, 1, 0]) + assert_add_equals([1, 2, 4], 0, [1, 2, 4]) + + assert_add_equals(slice(0, 0), 10, []) + assert_add_equals(slice(1, 4), 10, [11, 12, 13]) + assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11]) + assert_add_equals(slice(2, None, -1), 10, [12, 11, 10]) + assert_add_equals([1, 2, 4], 10, [11, 12, 14]) + + 
assert_add_equals(slice(0, 0), -1, []) + assert_add_equals(slice(1, 4), -1, [0, 1, 2]) + assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0]) + assert_add_equals([1, 2, 4], -1, [0, 1, 3]) + + self.assertRaises(ValueError, + lambda: BlockPlacement(slice(1, 4)).add(-10)) + self.assertRaises(ValueError, + lambda: BlockPlacement([1, 2, 4]).add(-10)) + self.assertRaises(ValueError, + lambda: BlockPlacement(slice(2, None, -1)).add(-1)) + + # def test_blockplacement_array_add(self): + + # assert_add_equals(slice(0, 2), [0, 1, 1], [0, 2, 3]) + # assert_add_equals(slice(2, None, -1), [1, 1, 0], [3, 2, 0]) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) + + + diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1eb43237c3185..a6c2bb9f56602 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -611,7 +611,7 @@ def test_setitem_change_dtype(self): s = dft['foo', 'two'] dft['foo', 'two'] = s > s.median() assert_series_equal(dft['foo', 'two'], s > s.median()) - tm.assert_isinstance(dft._data.blocks[1].items, MultiIndex) + # tm.assert_isinstance(dft._data.blocks[1].items, MultiIndex) reindexed = dft.reindex(columns=[('foo', 'two')]) assert_series_equal(reindexed['foo', 'two'], s > s.median()) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 935dfb65a0807..d17e2e2dcb12b 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -14,12 +14,10 @@ from pandas.core.index import (Index, MultiIndex, _get_combined_index, _ensure_index, _get_consensus_names, _all_indexes_same) -from pandas.core.internals import (TimeDeltaBlock, IntBlock, BoolBlock, BlockManager, - make_block, _consolidate) -from pandas.util.decorators import cache_readonly, Appender, Substitution -from pandas.core.common import (PandasError, ABCSeries, - is_timedelta64_dtype, is_datetime64_dtype, - is_integer_dtype, isnull) +from pandas.core.internals import (items_overlap_with_suffix, + concatenate_block_managers) +from pandas.util.decorators import Appender, Substitution +from pandas.core.common import ABCSeries from pandas.io.parsers import TextFileReader import pandas.core.common as com @@ -27,7 +25,7 @@ import pandas.lib as lib import pandas.algos as algos import pandas.hashtable as _hash -import pandas.tslib as tslib + @Substitution('\nleft : DataFrame') @Appender(_merge_doc, indents=0) @@ -186,16 +184,20 @@ def __init__(self, left, right, how='inner', on=None, def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - # this is a bit kludgy - ldata, rdata = self._get_merge_data() + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) + + lindexers = {1: left_indexer} if left_indexer is not None else {} + rindexers = {1: right_indexer} if right_indexer is not None else {} - # TODO: more efficiently handle group keys to avoid extra - # consolidation! 
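+        # indexer dicts are keyed by BlockManager axis: axis 0 is the
+        # items (column) axis, axis 1 is the row axis, and an empty dict
+        # leaves that axis untouched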
- join_op = _BlockJoinOperation([ldata, rdata], join_index, - [left_indexer, right_indexer], axis=1, - copy=self.copy) + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) - result_data = join_op.get_result() result = DataFrame(result_data).__finalize__(self, method='merge') self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -281,8 +283,18 @@ def _get_merge_data(self): """ ldata, rdata = self.left._data, self.right._data lsuf, rsuf = self.suffixes - ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf, - copydata=False) + + llabels, rlabels = items_overlap_with_suffix( + ldata.items, lsuf, rdata.items, rsuf) + + if not llabels.equals(ldata.items): + ldata = ldata.copy(deep=False) + ldata.set_axis(0, llabels) + + if not rlabels.equals(rdata.items): + rdata = rdata.copy(deep=False) + rdata.set_axis(0, rlabels) + return ldata, rdata def _get_merge_keys(self): @@ -410,14 +422,14 @@ def _validate_specification(self): if self.right_index: if len(self.left_on) != self.right.index.nlevels: raise ValueError('len(left_on) must equal the number ' - 'of levels in the index of "right"') + 'of levels in the index of "right"') self.right_on = [None] * n elif self.right_on is not None: n = len(self.right_on) if self.left_index: if len(self.right_on) != self.left.index.nlevels: raise ValueError('len(right_on) must equal the number ' - 'of levels in the index of "left"') + 'of levels in the index of "left"') self.left_on = [None] * n if len(self.right_on) != len(self.left_on): raise ValueError("len(right_on) must equal len(left_on)") @@ -487,7 +499,11 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() # this is a bit kludgy - ldata, rdata = self._get_merge_data() + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + + llabels, rlabels = items_overlap_with_suffix(ldata.items, lsuf, + rdata.items, rsuf) if self.fill_method == 'ffill': left_join_indexer = algos.ffill_indexer(left_indexer) @@ -496,11 +512,14 @@ def get_result(self): left_join_indexer = left_indexer right_join_indexer = right_indexer - join_op = _BlockJoinOperation([ldata, rdata], join_index, - [left_join_indexer, right_join_indexer], - axis=1, copy=self.copy) + lindexers = {1: left_join_indexer} if left_join_indexer is not None else {} + rindexers = {1: right_join_indexer} if right_join_indexer is not None else {} + + result_data = concatenate_block_managers( + [(ldata, lindexers), (rdata, rindexers)], + axes=[llabels.append(rlabels), join_index], + concat_axis=0, copy=self.copy) - result_data = join_op.get_result() result = DataFrame(result_data) self._maybe_add_join_keys(result, left_indexer, right_indexer) @@ -640,238 +659,6 @@ def _sort_labels(uniques, left, right): return new_left, new_right -class _BlockJoinOperation(object): - """ - BlockJoinOperation made generic for N DataFrames - - Object responsible for orchestrating efficient join operation between two - BlockManager data structures - """ - def __init__(self, data_list, join_index, indexers, axis=1, copy=True): - if axis <= 0: # pragma: no cover - raise MergeError('Only axis >= 1 supported for this operation') - - if len(data_list) != len(indexers): - raise AssertionError("data_list and indexers must have the same " - "length") - - self.units = [] - for data, indexer in zip(data_list, indexers): - if not data.is_consolidated(): - data = data.consolidate() - data._set_ref_locs() - 
self.units.append(_JoinUnit(data.blocks, indexer)) - - self.join_index = join_index - self.axis = axis - self.copy = copy - self.offsets = None - - # do NOT sort - self.result_items = _concat_indexes([d.items for d in data_list]) - self.result_axes = list(data_list[0].axes) - self.result_axes[0] = self.result_items - self.result_axes[axis] = self.join_index - - def _prepare_blocks(self): - blockmaps = [] - - for unit in self.units: - join_blocks = unit.get_upcasted_blocks() - type_map = {} - for blk in join_blocks: - type_map.setdefault(blk.ftype, []).append(blk) - blockmaps.append((unit, type_map)) - - return blockmaps - - def get_result(self): - """ - Returns - ------- - merged : BlockManager - """ - blockmaps = self._prepare_blocks() - kinds = _get_merge_block_kinds(blockmaps) - - # maybe want to enable flexible copying <-- what did I mean? - kind_blocks = [] - for klass in kinds: - klass_blocks = [] - for unit, mapping in blockmaps: - if klass in mapping: - klass_blocks.extend((unit, b) for b in mapping[klass]) - - # blocks that we are going to merge - kind_blocks.append(klass_blocks) - - # create the merge offsets, essentially where the resultant blocks go in the result - if not self.result_items.is_unique: - - # length of the merges for each of the klass blocks - self.offsets = np.zeros(len(blockmaps)) - for kb in kind_blocks: - kl = list(b.get_merge_length() for unit, b in kb) - self.offsets += np.array(kl) - - # merge the blocks to create the result blocks - result_blocks = [] - for klass_blocks in kind_blocks: - res_blk = self._get_merged_block(klass_blocks) - result_blocks.append(res_blk) - - return BlockManager(result_blocks, self.result_axes) - - def _get_merged_block(self, to_merge): - if len(to_merge) > 1: - - # placement set here - return self._merge_blocks(to_merge) - else: - unit, block = to_merge[0] - blk = unit.reindex_block(block, self.axis, - self.result_items, copy=self.copy) - - # set placement / invalidate on a unique result - if self.result_items.is_unique and blk._ref_locs is not None: - if not self.copy: - blk = blk.copy() - blk.set_ref_locs(None) - - return blk - - - def _merge_blocks(self, merge_chunks): - """ - merge_chunks -> [(_JoinUnit, Block)] - """ - funit, fblock = merge_chunks[0] - fidx = funit.indexer - - out_shape = list(fblock.get_values().shape) - - n = len(fidx) if fidx is not None else out_shape[self.axis] - - merge_lengths = list(blk.get_merge_length() for unit, blk in merge_chunks) - out_shape[0] = sum(merge_lengths) - out_shape[self.axis] = n - - # Should use Fortran order?? 
- block_dtype = _get_block_dtype([x[1] for x in merge_chunks]) - out = np.empty(out_shape, dtype=block_dtype) - - sofar = 0 - for unit, blk in merge_chunks: - out_chunk = out[sofar: sofar + len(blk)] - com.take_nd(blk.get_values(), unit.indexer, self.axis, out=out_chunk) - sofar += len(blk) - - # does not sort - new_block_items = _concat_indexes([b.items for _, b in merge_chunks]) - - # need to set placement if we have a non-unique result - # calculate by the existing placement plus the offset in the result set - placement = None - if not self.result_items.is_unique: - placement = [] - offsets = np.append(np.array([0]),self.offsets.cumsum()[:-1]) - for (unit, blk), offset in zip(merge_chunks,offsets): - placement.extend(blk.ref_locs+offset) - - return make_block(out, new_block_items, self.result_items, placement=placement) - - -class _JoinUnit(object): - """ - Blocks plus indexer - """ - - def __init__(self, blocks, indexer): - self.blocks = blocks - self.indexer = indexer - - @cache_readonly - def mask_info(self): - if self.indexer is None or not _may_need_upcasting(self.blocks): - return None - else: - mask = self.indexer == -1 - needs_masking = mask.any() - return (mask, needs_masking) - - def get_upcasted_blocks(self): - # will short-circuit and not compute needs_masking if indexer is None - if self.mask_info is not None and self.mask_info[1]: - return _upcast_blocks(self.blocks) - return self.blocks - - def reindex_block(self, block, axis, ref_items, copy=True): - if self.indexer is None: - result = block.copy() if copy else block - else: - result = block.reindex_axis(self.indexer, axis=axis, - mask_info=self.mask_info) - result.ref_items = ref_items - return result - - -def _may_need_upcasting(blocks): - for block in blocks: - if isinstance(block, (IntBlock, BoolBlock)) and not isinstance(block, TimeDeltaBlock): - return True - return False - - -def _upcast_blocks(blocks): - """ - Upcast and consolidate if necessary - """ - new_blocks = [] - for block in blocks: - if isinstance(block, TimeDeltaBlock): - # these are int blocks underlying, but are ok - newb = block - elif isinstance(block, IntBlock): - newb = make_block(block.values.astype(float), block.items, - block.ref_items, placement=block._ref_locs) - elif isinstance(block, BoolBlock): - newb = make_block(block.values.astype(object), block.items, - block.ref_items, placement=block._ref_locs) - else: - newb = block - new_blocks.append(newb) - - # use any ref_items - return _consolidate(new_blocks, newb.ref_items) - - -def _get_all_block_kinds(blockmaps): - kinds = set() - for mapping in blockmaps: - kinds |= set(mapping) - return kinds - - -def _get_merge_block_kinds(blockmaps): - kinds = set() - for _, mapping in blockmaps: - kinds |= set(mapping) - return kinds - - -def _get_block_dtype(blocks): - if len(blocks) == 0: - return object - blk1 = blocks[0] - dtype = blk1.dtype - - if issubclass(dtype.type, np.floating): - for blk in blocks: - if blk.dtype.type == np.float64: - return blk.dtype - - return dtype - #---------------------------------------------------------------------- # Concatenate DataFrame objects @@ -1061,220 +848,38 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.new_axes = self._get_new_axes() def get_result(self): - if self._is_series and self.axis == 0: - new_data = com._concat_compat([x.get_values() for x in self.objs]) - name = com._consensus_name_attr(self.objs) - new_data = self._post_merge(new_data) - return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, 
method='concat') - elif self._is_series: - data = dict(zip(range(len(self.objs)), self.objs)) - index, columns = self.new_axes - tmpdf = DataFrame(data, index=index) - if columns is not None: - tmpdf.columns = columns - return tmpdf.__finalize__(self, method='concat') + if self._is_series: + if self.axis == 0: + new_data = com._concat_compat([x.get_values() for x in self.objs]) + name = com._consensus_name_attr(self.objs) + return Series(new_data, index=self.new_axes[0], name=name).__finalize__(self, method='concat') + else: + data = dict(zip(range(len(self.objs)), self.objs)) + index, columns = self.new_axes + tmpdf = DataFrame(data, index=index) + if columns is not None: + tmpdf.columns = columns + return tmpdf.__finalize__(self, method='concat') else: - new_data = self._get_concatenated_data() - new_data = self._post_merge(new_data) - return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') + mgrs_indexers = [] + for obj in self.objs: + mgr = obj._data + indexers = {} + for ax, new_labels in enumerate(self.new_axes): + if ax == self.axis: + # Suppress reindexing on concat axis + continue - def _post_merge(self, data): - if isinstance(data, BlockManager): - data = data.post_merge(self.objs) - return data - - def _get_fresh_axis(self): - return Index(np.arange(len(self._get_concat_axis()))) - - def _prepare_blocks(self): - reindexed_data = self._get_reindexed_data() - - # we are consolidating as we go, so just add the blocks, no-need for dtype mapping - blockmaps = [] - for data in reindexed_data: - data = data.consolidate() - data._set_ref_locs() - blockmaps.append(data.get_block_map(typ='dict')) - return blockmaps, reindexed_data - - def _get_concatenated_data(self): - # need to conform to same other (joined) axes for block join - blockmaps, rdata = self._prepare_blocks() - kinds = _get_all_block_kinds(blockmaps) - - try: - # need to conform to same other (joined) axes for block join - new_blocks = [] - for kind in kinds: - klass_blocks = [] - for mapping in blockmaps: - l = mapping.get(kind) - if l is None: - l = [ None ] - klass_blocks.extend(l) - stacked_block = self._concat_blocks(klass_blocks) - new_blocks.append(stacked_block) - - if self.axis == 0 and self.ignore_index: - self.new_axes[0] = self._get_fresh_axis() - - for blk in new_blocks: - blk.ref_items = self.new_axes[0] - - new_data = BlockManager(new_blocks, self.new_axes) - - # Eventual goal would be to move everything to PandasError or other explicit error - except (Exception, PandasError): # EAFP - - # should not be possible to fail here for the expected reason with - # axis = 0 - if self.axis == 0: # pragma: no cover - raise - - new_data = {} - for item in self.new_axes[0]: - new_data[item] = self._concat_single_item(rdata, item) - - return new_data - - def _get_reindexed_data(self): - # HACK: ugh - - reindexed_data = [] - axes_to_reindex = list(enumerate(self.new_axes)) - axes_to_reindex.pop(self.axis) - - for obj in self.objs: - data = obj._data.prepare_for_merge() - for i, ax in axes_to_reindex: - data = data.reindex_axis(ax, axis=i, copy=False) - reindexed_data.append(data) - - return reindexed_data - - def _concat_blocks(self, blocks): - - values_list = [b.get_values() for b in blocks if b is not None] - concat_values = com._concat_compat(values_list, axis=self.axis) - - if self.axis > 0: - # Not safe to remove this check, need to profile - if not _all_indexes_same([b.items for b in blocks]): - # TODO: Either profile this piece or remove. 
- # FIXME: Need to figure out how to test whether this line exists or does not...(unclear if even possible - # or maybe would require performance test) - raise PandasError('dtypes are not consistent throughout ' - 'DataFrames') - return make_block(concat_values, - blocks[0].items, - self.new_axes[0], - placement=blocks[0]._ref_locs) - else: + obj_labels = mgr.axes[ax] + if not new_labels.equals(obj_labels): + indexers[ax] = obj_labels.reindex(new_labels)[1] - offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for - x in self.objs])] - indexer = np.concatenate([offsets[i] + b.ref_locs - for i, b in enumerate(blocks) - if b is not None]) - if self.ignore_index: - concat_items = indexer - else: - concat_items = self.new_axes[0].take(indexer) - - if self.ignore_index: - ref_items = self._get_fresh_axis() - return make_block(concat_values, concat_items, ref_items) - - block = make_block(concat_values, concat_items, self.new_axes[0]) - - # we need to set the ref_locs in this block so we have the mapping - # as we now have a non-unique index across dtypes, and we need to - # map the column location to the block location - # GH3602 - if not self.new_axes[0].is_unique: - block.set_ref_locs(indexer) - - return block - - def _concat_single_item(self, objs, item): - # this is called if we don't have consistent dtypes in a row-wise append - all_values = [] - dtypes = [] - alls = set() - - # figure out the resulting dtype of the combination - for data, orig in zip(objs, self.objs): - d = dict([ (t,False) for t in ['object','datetime','timedelta','other'] ]) - if item in orig: - values = data.get(item) - if hasattr(values,'to_dense'): - values = values.to_dense() - all_values.append(values) - - dtype = values.dtype - - if issubclass(dtype.type, (np.object_, np.bool_)): - d['object'] = True - alls.add('object') - elif is_datetime64_dtype(dtype): - d['datetime'] = True - alls.add('datetime') - elif is_timedelta64_dtype(dtype): - d['timedelta'] = True - alls.add('timedelta') - else: - d['other'] = True - alls.add('other') + mgrs_indexers.append((obj._data, indexers)) - else: - all_values.append(None) - d['other'] = True - alls.add('other') - - dtypes.append(d) - - if 'datetime' in alls or 'timedelta' in alls: - - if 'object' in alls or 'other' in alls: - - for v, d in zip(all_values,dtypes): - if d.get('datetime') or d.get('timedelta'): - pass - - # if we have all null, then leave a date/time like type - # if we have only that type left - elif v is None or isnull(v).all(): - - alls.discard('other') - alls.discard('object') - - # create the result - if 'object' in alls: - empty_dtype, fill_value = np.object_, np.nan - elif 'other' in alls: - empty_dtype, fill_value = np.float64, np.nan - elif 'datetime' in alls: - empty_dtype, fill_value = 'M8[ns]', tslib.iNaT - elif 'timedelta' in alls: - empty_dtype, fill_value = 'm8[ns]', tslib.iNaT - else: # pragma - raise AssertionError("invalid dtype determination in concat_single_item") - - to_concat = [] - for obj, item_values in zip(objs, all_values): - if item_values is None or isnull(item_values).all(): - shape = obj.shape[1:] - missing_arr = np.empty(shape, dtype=empty_dtype) - missing_arr.fill(fill_value) - to_concat.append(missing_arr) - else: - to_concat.append(item_values) + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True) - # this method only gets called with axis >= 1 - if self.axis < 1: - raise AssertionError("axis must be >= 1, input was" - " {0}".format(self.axis)) - return 
com._concat_compat(to_concat, axis=self.axis - 1) + return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') def _get_result_dim(self): if self._is_series and self.axis == 1: @@ -1303,13 +908,7 @@ def _get_new_axes(self): for i, ax in zip(indices, self.join_axes): new_axes[i] = ax - if self.ignore_index: - concat_axis = None - else: - concat_axis = self._get_concat_axis() - - new_axes[self.axis] = concat_axis - + new_axes[self.axis] = self._get_concat_axis() return new_axes def _get_comb_axis(self, i): @@ -1325,9 +924,16 @@ def _get_comb_axis(self, i): return _get_combined_index(all_indexes, intersect=self.intersect) def _get_concat_axis(self): + """ + Return index to be used along concatenation axis. + """ if self._is_series: if self.axis == 0: indexes = [x.index for x in self.objs] + elif self.ignore_index: + idx = Index(np.arange(len(self.objs))) + idx.is_unique = True # arange is always unique + return idx elif self.keys is None: names = [] for x in self.objs: @@ -1338,13 +944,21 @@ def _get_concat_axis(self): if x.name is not None: names.append(x.name) else: - return Index(np.arange(len(self.objs))) + idx = Index(np.arange(len(self.objs))) + idx.is_unique = True + return idx + return Index(names) else: return _ensure_index(self.keys) else: indexes = [x._data.axes[self.axis] for x in self.objs] + if self.ignore_index: + idx = Index(np.arange(sum(len(i) for i in indexes))) + idx.is_unique = True + return idx + if self.keys is None: concat_axis = _concat_indexes(indexes) else: diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 146c244e7d775..8e11c78ecd135 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -584,6 +584,19 @@ def test_merge_different_column_key_names(self): assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan]) assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7]) + def test_merge_copy(self): + left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) + right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) + + merged = merge(left, right, left_index=True, + right_index=True, copy=True) + + merged['a'] = 6 + self.assert_((left['a'] == 0).all()) + + merged['d'] = 'peekaboo' + self.assert_((right['d'] == 'bar').all()) + def test_merge_nocopy(self): left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) @@ -1765,11 +1778,14 @@ def test_panel_join_overlap(self): p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] p2 = panel.ix[['ItemB', 'ItemC']] + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') no_overlap = panel.ix[['ItemA']] - expected = p1_suf.join(p2_suf).join(no_overlap) + expected = no_overlap.join(p1_suf.join(p2_suf)) tm.assert_panel_equal(joined, expected) def test_panel_join_many(self): diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 23a6ae0982771..dd72a5245e7b2 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -337,7 +337,8 @@ def _take_new_index(obj, indexer, new_index, axis=0): elif isinstance(obj, DataFrame): if axis == 1: raise NotImplementedError - return DataFrame(obj._data.take(indexer, new_index=new_index, axis=1)) + return DataFrame(obj._data.reindex_indexer( + new_axis=new_index, indexer=indexer, axis=1)) else: raise NotImplementedError diff 
--git a/vb_suite/eval.py b/vb_suite/eval.py index 3b0efa9e88f48..36aa702b5602a 100644 --- a/vb_suite/eval.py +++ b/vb_suite/eval.py @@ -55,7 +55,7 @@ start_date=datetime(2013, 7, 26)) eval_frame_mult_python = \ - Benchmark("pdl.eval('df * df2 * df3 * df4', engine='python')", + Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", common_setup, name='eval_frame_mult_python', start_date=datetime(2013, 7, 21)) @@ -102,7 +102,7 @@ name='eval_frame_chained_cmp_one_thread', start_date=datetime(2013, 7, 26)) -setup = common_setup +# setup = common_setup eval_frame_chained_cmp_python = \ Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", common_setup, name='eval_frame_chained_cmp_python',
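As a closing cross-check, the copy semantics pinned down by test_merge_copy
above can be exercised against any pandas build; a minimal sketch (the frames
mirror the test data, and merge's copy keyword defaults to True):

import pandas as pd

left = pd.DataFrame({'a': 0, 'b': 1}, index=range(10))
right = pd.DataFrame({'c': 'foo', 'd': 'bar'}, index=range(10))

# with copy=True the merged frame owns its blocks, so writing to the
# result must not leak back into either input
merged = pd.merge(left, right, left_index=True, right_index=True, copy=True)
merged['a'] = 6
assert (left['a'] == 0).all()

merged['d'] = 'peekaboo'
assert (right['d'] == 'bar').all()

The complementary test_merge_nocopy covers copy=False, where the result is
allowed to share blocks with its inputs and such writes may propagate.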