Refactor

otsaloma · Dec 14, 2024 · 23cc69a · 23cc69a
1 parent 5654730
commit 23cc69a
Show file tree

Hide file tree

Showing 11 changed files with 23 additions and 33 deletions.
diff --git a/.flake8 b/.flake8
@@ -1,4 +1,4 @@
 [flake8]
 select = E1,E9,F
-ignore = E129
+ignore = E125,E129
 exclude = doc/comparison/blocks,venv
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,6 @@ __pycache__
 benchmark*.csv
 benchmark-head.py
 build
-data/orig
 dataiter.egg-info
 dist
 doc/_build

diff --git a/NEWS.md b/NEWS.md
@@ -16,9 +16,9 @@ PENDING: Dataiter 0.999
 
 This is a breaking change to switch the string data type from the
 fixed-width `str_` a.k.a. `<U#` to the variable-width `StringDType`
-introduced in NumPy 2.0. The main benefit is greatly reduced memory use
-in many cases, making strings actually generally usable. The note about
-stability below `0.99` still applies.
+introduced in NumPy 2.0. The main benefit is greatly reduced memory use,
+making strings usable without needing to be careful or falling back to
+object dtype. The note about stability below release 0.99 still applies.
 
 2024-08-17: Dataiter 0.99
 =========================

diff --git a/dataiter/data_frame.py b/dataiter/data_frame.py
@@ -514,7 +514,6 @@ def from_pandas(cls, data, *, dtypes={}):
 
         `dtypes` is an optional dict mapping column names to NumPy datatypes.
         """
-        dtypes = dtypes.copy()
         data = {x: data[x].to_numpy(copy=True) for x in data.columns}
         for name, value in data.items():
             # Pandas object columns are likely to be strings,
@@ -855,10 +854,8 @@ def read_csv(cls, path, *, encoding="utf-8", sep=",", header=True, columns=[], d
         Return a new data frame from CSV file `path`.
 
         Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.
-
-        `columns` is an optional list of columns to limit to.
-
-        `dtypes` is an optional dict mapping column names to NumPy datatypes.
+        `columns` is an optional list of columns to limit to. `dtypes` is an
+        optional dict mapping column names to NumPy datatypes.
         """
         import pandas as pd
         data = pd.read_csv(path,
@@ -880,7 +877,6 @@ def read_json(cls, path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs):
         Return a new data frame from JSON file `path`.
 
         Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.
-
         `columns` is an optional list of columns to limit to. `dtypes` is an
         optional dict mapping column names to NumPy datatypes. `kwargs` are
         passed to ``json.load``.
@@ -904,9 +900,8 @@ def read_parquet(cls, path, *, columns=[], dtypes={}):
         """
         Return a new data frame from Parquet file `path`.
 
-        `columns` is an optional list of columns to limit to.
-
-        `dtypes` is an optional dict mapping column names to NumPy datatypes.
+        `columns` is an optional list of columns to limit to. `dtypes` is an
+        optional dict mapping column names to NumPy datatypes.
         """
         import pyarrow.parquet as pq
         columns = columns or None
@@ -1189,6 +1184,8 @@ def unique(self, *colnames):
         >>> data = di.read_csv("data/listings.csv")
         >>> data.unique("hood")
         """
+        # Strings are not yet usable here, so we need to work around with rank:
+        # TypeError: The axis argument to unique is not supported for dtype StringDType
         colnames = colnames or self.colnames
         if (len(colnames) == 1 and
             not self[colnames[0]].is_object() and
@@ -1256,7 +1253,6 @@ def write_json(self, path, *, encoding="utf-8", **kwargs):
         Write data frame to JSON file `path`.
 
         Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.
-
         `kwargs` are passed to ``json.JSONEncoder``.
         """
         return self.to_list_of_dicts().write_json(path, encoding=encoding, **kwargs)

diff --git a/dataiter/geojson.py b/dataiter/geojson.py
@@ -34,9 +34,9 @@ class GeoJSON(DataFrame):
     A class for GeoJSON data.
 
     GeoJSON is a simple wrapper class that reads GeoJSON features into a
-    :class:`.DataFrame`. Any operations on the data are thus done with
-    methods provided by the data frame class. Geometry is available in the
-    "geometry" column, but no special geometric operations are supported.
+    :class:`.DataFrame`. Any operations on the data are thus done with methods
+    provided by the data frame class. Geometry is available in the "geometry"
+    column, but no special geometric operations are supported.
 
     All other data is available in the "metadata" attribute as an
     ``attd.AttributeDict``.
@@ -88,12 +88,9 @@ def read(cls, path, *, encoding="utf-8", columns=[], dtypes={}, **kwargs):
         Return data from GeoJSON file `path`.
 
         Will automatically decompress if `path` ends in ``.bz2|.gz|.xz``.
-
-        `columns` is an optional list of columns to limit to.
-
-        `dtypes` is an optional dict mapping column names to NumPy datatypes.
-
-        `kwargs` are passed to ``json.load``.
+        `columns` is an optional list of columns to limit to. `dtypes` is an
+        optional dict mapping column names to NumPy datatypes. `kwargs` are
+        passed to ``json.load``.
         """
         with util.xopen(path, "rt", encoding=encoding) as f:
             raw = AttributeDict(json.load(f, **kwargs))
@@ -137,7 +134,6 @@ def write(self, path, *, encoding="utf-8", **kwargs):
         Write data to GeoJSON file `path`.
 
         Will automatically compress if `path` ends in ``.bz2|.gz|.xz``.
-
         `kwargs` are passed to ``json.dump``.
         """
         kwargs.setdefault("default", str)

diff --git a/dataiter/util.py b/dataiter/util.py
@@ -45,11 +45,9 @@ def count_digits(value):
     return n, m
 
 def format_alias_doc(alias, target):
-    indent = " " * 8
-    note = (
+    return f"{target.__doc__}\n\n{' '*8}" + (
         ".. note:: :func:`{}` is a convenience alias for :meth:`{}`."
         .format(alias.__name__, target.__qualname__))
-    return target.__doc__ + "\n\n" + indent + note
 
 def format_floats(seq, ksep=None):
     precision = dataiter.PRINT_FLOAT_PRECISION

diff --git a/dataiter/vector.py b/dataiter/vector.py
@@ -442,7 +442,9 @@ def _np_array(cls, object, dtype=None):
         # NumPy still defaults to fixed width strings.
         # In some cases we can only fix the dtype ex-post.
         if dtype is None:
-            if util.unique_types(object) == {str}:
+            if object and (
+                isinstance(object[0], str) and
+                isinstance(object[-1], str)):
                 dtype = dtypes.string
         dtype = cls._map_input_dtype(dtype)
         array = np.array(object, dtype)

diff --git a/doc/aggregation.rst b/doc/aggregation.rst
@@ -130,7 +130,7 @@ faster as there's no more need to load anything.
           ``di.USE_NUMBA`` to see if Numba has been found. You can also
           set ``di.USE_NUMBA = False`` if you have Numba installed, but
           it's not working right, or via the environment variable
-          ``DATAITER_USE_NUMBA=false``. Sometimes it's the just the
+          ``DATAITER_USE_NUMBA=false``. Sometimes it's just the
           `caching
           <https://numba.readthedocs.io/en/stable/developer/caching.html>`_
           part of Numba that's causing issues. When upgrading you might

diff --git a/doc/check.py b/doc/check.py
@@ -14,6 +14,7 @@
     "data-frame.rst": di.DataFrame,
     "data-frame-column.rst": di.DataFrameColumn,
     "dt.rst": di.dt,
+    "dtypes.rst": di.dtypes,
     "geojson.rst": di.GeoJSON,
     "list-of-dicts.rst": di.ListOfDicts,
     "vector.rst": di.Vector,

diff --git a/doc/comparison/blocks/rbind-pandas.py b/doc/comparison/blocks/rbind-pandas.py
@@ -1,2 +1 @@
 pd.concat([data1, data2])
-data1.append(data2)
diff --git a/doc/comparison/index.html b/doc/comparison/index.html
@@ -162,8 +162,7 @@ <h2>Concatenation</h2>
       <pre data-src="blocks/rbind-pandas.py"></pre>
       <p class="note"></p>
       <p class="note"></p>
-      <p class="note"><code>append</code>
-        is <a href="https://github.com/pandas-dev/pandas/issues/35407">deprecated</a>.</p>
+      <p class="note"></p>
     </div>
     <div class="grid-row" search-terms="cbind bind cols concat">
       <pre data-src="blocks/cbind-dplyr.R"></pre>