Skip to content

Commit

Permalink
Refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
otsaloma committed Dec 14, 2024
1 parent 5654730 commit 23cc69a
Show file tree
Hide file tree
Showing 11 changed files with 23 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
select = E1,E9,F
ignore = E129
ignore = E125,E129
exclude = doc/comparison/blocks,venv
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ __pycache__
benchmark*.csv
benchmark-head.py
build
data/orig
dataiter.egg-info
dist
doc/_build
Expand Down
6 changes: 3 additions & 3 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ PENDING: Dataiter 0.999

This is a breaking change to switch the string data type from the
fixed-width `str_` a.k.a. `<U#` to the variable-width `StringDType`
introduced in NumPy 2.0. The main benefit is greatly reduced memory use
in many cases, making strings actually generally usable. The note about
stability below `0.99` still applies.
introduced in NumPy 2.0. The main benefit is greatly reduced memory use,
making strings usable without needing to be careful or falling back to
the object dtype. The note about stability below release 0.99 still applies.

2024-08-17: Dataiter 0.99
=========================
Expand Down
16 changes: 6 additions & 10 deletions dataiter/data_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,6 @@ def from_pandas(cls, data, *, dtypes={}):
`dtypes` is an optional dict mapping column names to NumPy datatypes.
"""
dtypes = dtypes.copy()
data = {x: data[x].to_numpy(copy=True) for x in data.columns}
for name, value in data.items():
# Pandas object columns are likely to be strings,
Expand Down Expand Up @@ -855,10 +854,8 @@ def read_csv(cls, path, *, encoding="utf-8", sep=",", header=True, columns=[], d
Return a new data frame from CSV file `path`.
Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.
`columns` is an optional list of columns to limit to.
`dtypes` is an optional dict mapping column names to NumPy datatypes.
`columns` is an optional list of columns to limit to. `dtypes` is an
optional dict mapping column names to NumPy datatypes.
"""
import pandas as pd
data = pd.read_csv(path,
Expand All @@ -880,7 +877,6 @@ def read_json(cls, path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs):
Return a new data frame from JSON file `path`.
Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.
`columns` is an optional list of columns to limit to. `dtypes` is an
optional dict mapping column names to NumPy datatypes. `kwargs` are
passed to ``json.load``.
Expand All @@ -904,9 +900,8 @@ def read_parquet(cls, path, *, columns=[], dtypes={}):
"""
Return a new data frame from Parquet file `path`.
`columns` is an optional list of columns to limit to.
`dtypes` is an optional dict mapping column names to NumPy datatypes.
`columns` is an optional list of columns to limit to. `dtypes` is an
optional dict mapping column names to NumPy datatypes.
"""
import pyarrow.parquet as pq
columns = columns or None
Expand Down Expand Up @@ -1189,6 +1184,8 @@ def unique(self, *colnames):
>>> data = di.read_csv("data/listings.csv")
>>> data.unique("hood")
"""
# Strings are not yet usable here, so we need to work around with rank:
# TypeError: The axis argument to unique is not supported for dtype StringDType
colnames = colnames or self.colnames
if (len(colnames) == 1 and
not self[colnames[0]].is_object() and
Expand Down Expand Up @@ -1256,7 +1253,6 @@ def write_json(self, path, *, encoding="utf-8", **kwargs):
Write data frame to JSON file `path`.
Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.
`kwargs` are passed to ``json.JSONEncoder``.
"""
return self.to_list_of_dicts().write_json(path, encoding=encoding, **kwargs)
Expand Down
16 changes: 6 additions & 10 deletions dataiter/geojson.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ class GeoJSON(DataFrame):
A class for GeoJSON data.
GeoJSON is a simple wrapper class that reads GeoJSON features into a
:class:`.DataFrame`. Any operations on the data are thus done with
methods provided by the data frame class. Geometry is available in the
"geometry" column, but no special geometric operations are supported.
:class:`.DataFrame`. Any operations on the data are thus done with methods
provided by the data frame class. Geometry is available in the "geometry"
column, but no special geometric operations are supported.
All other data is available in the "metadata" attribute as an
``attd.AttributeDict``.
Expand Down Expand Up @@ -88,12 +88,9 @@ def read(cls, path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs):
Return data from GeoJSON file `path`.
Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.
`columns` is an optional list of columns to limit to.
`dtypes` is an optional dict mapping column names to NumPy datatypes.
`kwargs` are passed to ``json.load``.
`columns` is an optional list of columns to limit to. `dtypes` is an
optional dict mapping column names to NumPy datatypes. `kwargs` are
passed to ``json.load``.
"""
with util.xopen(path, "rt", encoding=encoding) as f:
raw = AttributeDict(json.load(f, **kwargs))
Expand Down Expand Up @@ -137,7 +134,6 @@ def write(self, path, *, encoding="utf-8", **kwargs):
Write data to GeoJSON file `path`.
Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.
`kwargs` are passed to ``json.dump``.
"""
kwargs.setdefault("default", str)
Expand Down
4 changes: 1 addition & 3 deletions dataiter/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,9 @@ def count_digits(value):
return n, m

def format_alias_doc(alias, target):
indent = " " * 8
note = (
return f"{target.__doc__}\n\n{' '*8}" + (
".. note:: :func:`{}` is a convenience alias for :meth:`{}`."
.format(alias.__name__, target.__qualname__))
return target.__doc__ + "\n\n" + indent + note

def format_floats(seq, ksep=None):
precision = dataiter.PRINT_FLOAT_PRECISION
Expand Down
4 changes: 3 additions & 1 deletion dataiter/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,9 @@ def _np_array(cls, object, dtype=None):
# NumPy still defaults to fixed width strings.
# In some cases we can only fix the dtype ex-post.
if dtype is None:
if util.unique_types(object) == {str}:
if object and (
isinstance(object[0], str) and
isinstance(object[-1], str)):
dtype = dtypes.string
dtype = cls._map_input_dtype(dtype)
array = np.array(object, dtype)
Expand Down
2 changes: 1 addition & 1 deletion doc/aggregation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ faster as there's no more need to load anything.
``di.USE_NUMBA`` to see if Numba has been found. You can also
set ``di.USE_NUMBA = False`` if you have Numba installed, but
it's not working right, or via the environment variable
``DATAITER_USE_NUMBA=false``. Sometimes it's the just the
``DATAITER_USE_NUMBA=false``. Sometimes it's just the
`caching
<https://numba.readthedocs.io/en/stable/developer/caching.html>`_
part of Numba that's causing issues. When upgrading you might
Expand Down
1 change: 1 addition & 0 deletions doc/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"data-frame.rst": di.DataFrame,
"data-frame-column.rst": di.DataFrameColumn,
"dt.rst": di.dt,
"dtypes.rst": di.dtypes,
"geojson.rst": di.GeoJSON,
"list-of-dicts.rst": di.ListOfDicts,
"vector.rst": di.Vector,
Expand Down
1 change: 0 additions & 1 deletion doc/comparison/blocks/rbind-pandas.py
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
pd.concat([data1, data2])
data1.append(data2)
3 changes: 1 addition & 2 deletions doc/comparison/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,7 @@ <h2>Concatenation</h2>
<pre data-src="blocks/rbind-pandas.py"></pre>
<p class="note"></p>
<p class="note"></p>
<p class="note"><code>append</code>
is <a href="https://github.com/pandas-dev/pandas/issues/35407">deprecated</a>.</p>
<p class="note"></p>
</div>
<div class="grid-row" search-terms="cbind bind cols concat">
<pre data-src="blocks/cbind-dplyr.R"></pre>
Expand Down

0 comments on commit 23cc69a

Please sign in to comment.