Skip to content

Commit 7bdc2e5

Browse files
committed
fix some setup issues
1 parent 3d394d6 commit 7bdc2e5

2 files changed

Lines changed: 93 additions & 24 deletions

File tree

README.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ flowchart LR
5252

5353
- Python 3.9+
5454
- Apache Spark 3.5.0+
55-
- Java 17+
55+
- Java 17 (Java 21+ has known compatibility issues with Spark 3.5)
5656

5757
### Step 1: Download Deequ Pre-release JAR
5858

@@ -111,6 +111,9 @@ Install the beta wheel directly from the GitHub release:
111111
```bash
112112
pip install https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/pydeequ-2.0.0b1-py3-none-any.whl
113113
pip install pyspark[connect]==3.5.0
114+
115+
# Python 3.12+ users: also install setuptools (it provides the distutils module, which was removed from the standard library in Python 3.12)
116+
pip install setuptools
114117
```
115118

116119
### Step 5: Run Your First Check
@@ -255,7 +258,7 @@ check.hasCompleteness("col", gte(0.9))
255258
## PyDeequ 2.0 Troubleshooting
256259

257260
### Server won't start
258-
1. Check Java version: `java -version` (should be Java 17+)
261+
1. Check Java version: `java -version` (must be Java 17, not 21+)
259262
2. Check port availability: `lsof -i :15002`
260263
3. Check logs: `tail -f $SPARK_HOME/logs/spark-*-SparkConnectServer-*.out`
261264

@@ -268,6 +271,18 @@ ps aux | grep SparkConnectServer
268271
### ClassNotFoundException: DeequRelationPlugin
269272
Ensure the Deequ JAR is correctly specified in `--jars` when starting the server.
270273

274+
### UnsupportedOperationException: sun.misc.Unsafe not available
275+
This error occurs when using Java 21+ with Spark 3.5. Use Java 17 instead:
276+
```bash
277+
export JAVA_HOME=/path/to/java17
278+
```
279+
280+
### ModuleNotFoundError: No module named 'distutils'
281+
This occurs on Python 3.12+ because `distutils` was removed. Install setuptools:
282+
```bash
283+
pip install setuptools
284+
```
285+
271286
---
272287

273288
## PyDeequ 1.x (Legacy)

pydeequ/__init__.py

Lines changed: 76 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,35 +11,89 @@
1111
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1212
# ANY KIND, either express or implied. See the License for the specific
1313
# language governing permissions and limitations under the License.
14-
"""Placeholder docstrings"""
14+
"""
15+
PyDeequ - Python API for Deequ data quality library.
16+
17+
For PyDeequ 2.0 (Spark Connect), use:
18+
from pydeequ.v2 import VerificationSuite, Check, CheckLevel
19+
from pydeequ.v2.predicates import eq, gte
20+
21+
For PyDeequ 1.x (Legacy Py4J), set SPARK_VERSION env var and use:
22+
from pydeequ import deequ_maven_coord
23+
from pydeequ.checks import Check, CheckLevel
24+
"""
1525
__version__ = "2.0.0b1"
1626

17-
from pyspark.sql import SparkSession
27+
# Legacy imports are deferred to avoid requiring SPARK_VERSION for V2 users.
28+
# V2 users should import from pydeequ.v2 directly.
29+
30+
_deequ_maven_coord = None
31+
_f2j_maven_coord = "net.sourceforge.f2j:arpack_combined_all"
32+
33+
34+
def __getattr__(name):
35+
"""Lazy loading for legacy module attributes."""
36+
global _deequ_maven_coord
37+
38+
if name == "deequ_maven_coord":
39+
if _deequ_maven_coord is None:
40+
from pydeequ.configs import DEEQU_MAVEN_COORD
41+
_deequ_maven_coord = DEEQU_MAVEN_COORD
42+
return _deequ_maven_coord
43+
44+
if name == "f2j_maven_coord":
45+
return _f2j_maven_coord
46+
47+
if name in ("AnalysisRunner", "Check", "CheckLevel", "ColumnProfilerRunner",
48+
"PyDeequSession", "DEEQU_MAVEN_COORD"):
49+
# Import legacy modules on demand
50+
if name == "AnalysisRunner":
51+
from pydeequ.analyzers import AnalysisRunner
52+
return AnalysisRunner
53+
elif name == "Check":
54+
from pydeequ.checks import Check
55+
return Check
56+
elif name == "CheckLevel":
57+
from pydeequ.checks import CheckLevel
58+
return CheckLevel
59+
elif name == "ColumnProfilerRunner":
60+
from pydeequ.profiles import ColumnProfilerRunner
61+
return ColumnProfilerRunner
62+
elif name == "DEEQU_MAVEN_COORD":
63+
from pydeequ.configs import DEEQU_MAVEN_COORD
64+
return DEEQU_MAVEN_COORD
65+
66+
if name == "PyDeequSession":
67+
# Return the lazily-defined class
68+
return _get_pydeequ_session_class()
69+
70+
raise AttributeError(f"module 'pydeequ' has no attribute '{name}'")
1871

def _get_pydeequ_session_class():
    """Build (once) and return the legacy ``PyDeequSession`` class.

    The class definition is deferred into this factory so that importing
    ``pydeequ`` does not import ``pyspark.sql.SparkSession`` or the legacy
    Py4J modules at module-load time.

    Returns:
        The ``PyDeequSession`` class object. The same class object is
        returned on every call.
    """
    # Cache the class on the function itself: without this, each call would
    # define a brand-new class object, so isinstance checks between objects
    # obtained from separate attribute accesses would fail.
    cached = getattr(_get_pydeequ_session_class, "_cls", None)
    if cached is not None:
        return cached

    from pyspark.sql import SparkSession
    from pydeequ.analyzers import AnalysisRunner
    from pydeequ.checks import Check, CheckLevel
    from pydeequ.profiles import ColumnProfilerRunner

    class PyDeequSession:
        """
        For interacting with PyDeequ Modules at the "Runner" Level
        """

        def __init__(self, spark_session: SparkSession):
            self._spark_session = spark_session
            self._sc = spark_session.sparkContext
            self._jvm = spark_session._jvm

        def createColumnProfileRunner(self):
            return ColumnProfilerRunner(self._spark_session)

        def createAnalysisRunner(self):
            return AnalysisRunner(self._spark_session)

        def createCheck(self, level: CheckLevel, description: str, constraints=None):
            return Check(self._spark_session, level, description, constraints)

    _get_pydeequ_session_class._cls = PyDeequSession
    return PyDeequSession

0 commit comments

Comments
 (0)