Skip to content

Commit 3ec3a48

Browse files
Merge pull request #10 from RumbleDB/Improvements
Improvements
2 parents 0225d19 + bcdeb70 commit 3ec3a48

7 files changed

Lines changed: 98 additions & 16 deletions

File tree

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,13 @@ Even more queries can be found [here](https://colab.research.google.com/github/R
349349

350350
# Latest updates
351351

352+
## Version 0.2.0 alpha 3
353+
- Added parameters to the jsoniq magic to select the desired output to print: -j, -df, -pdf
354+
- Added an informative error message, with a hint on how to fix it, when trying to get a DataFrame and no schema is available.
355+
- Added parameter -t to the jsoniq magic to measure the response time
356+
- The RumbleSession object now saves the latest result (sequence of items) in a field called lastResult. This is particularly useful in notebooks for post-processing a result in Python after obtaining it through the jsoniq magic.
357+
- Improved static type detection upon binding a pandas or pyspark DataFrame as an input variable to a JSONiq query.
358+
352359
## Version 0.2.0 alpha 2
353360
- You can change the result size cap through the now accessible Rumble configuration (for example rumble.getRumbleConf().setResultSizeCap(10)). This controls how many items can be retrieved at most with a json() call. You can increase it to whichever number you would like if you reach the cap.
354361
- Add the JSONiq magic to execute JSONiq queries directly in a notebook cell, using the RumbleDB instance shipped with the library.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "jsoniq"
7-
version = "0.2.0a2"
7+
version = "0.2.0a3"
88
description = "Python edition of RumbleDB, a JSONiq engine"
99
requires-python = ">=3.11"
1010
dependencies = [
380 Bytes
Binary file not shown.

src/jsoniq/sequence.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,30 @@
22
from pyspark.sql import SparkSession
33
from pyspark.sql import DataFrame
44
import json
5+
import sys
56

67
class SequenceOfItems:
8+
schema_str = """
9+
No DataFrame available as no schema was automatically detected. If you still believe the output is structured enough, you could add a schema and validate expression explicitly to your query.
10+
11+
This is an example of how you can simply define a schema and wrap your query in a validate expression:
12+
13+
declare type local:mytype as {
14+
"product" : "string",
15+
"store-number" : "int",
16+
"quantity" : "decimal"
17+
};
18+
validate type local:mytype* {
19+
for $product in json-lines("http://rumbledb.org/samples/products-small.json", 10)
20+
where $product.quantity ge 995
21+
return $product
22+
}
23+
24+
RumbleDB keeps getting improved and automatic schema detection will improve as new versions get released. But even when RumbleDB fails to detect a schema, you can always declare your own schema as shown above.
25+
26+
For more information, see the documentation at https://docs.rumbledb.org/rumbledb-reference/types
27+
"""
28+
729
def __init__(self, sequence, rumblesession):
830
self._jsequence = sequence
931
self._rumblesession = rumblesession
@@ -28,9 +50,15 @@ def rdd(self):
2850
return rdd.map(lambda l: json.loads(l))
2951

3052
def df(self):
53+
if (not "DataFrame" in self._jsequence.availableOutputs()):
54+
sys.stderr.write(self.schema_str)
55+
return None
3156
return DataFrame(self._jsequence.getAsDataFrame(), self._sparksession)
3257

3358
def pdf(self):
59+
if (not "DataFrame" in self._jsequence.availableOutputs()):
60+
sys.stderr.write(self.schema_str)
61+
return None
3462
return self.df().toPandas()
3563

3664
def count(self):

src/jsoniq/session.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,8 @@ def bindDataFrameAsVariable(self, name: str, df):
169169

170170
def jsoniq(self, str):
171171
sequence = self._jrumblesession.runQuery(str);
172-
return SequenceOfItems(sequence, self);
172+
self.lastResult = SequenceOfItems(sequence, self);
173+
return self.lastResult;
173174

174175
def __getattr__(self, item):
175176
return getattr(self._sparksession, item)

src/jsoniqmagic/magic.py

Lines changed: 59 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,36 @@
11
from IPython.core.magic import Magics, cell_magic, magics_class
2+
from IPython.core.magic_arguments import (
3+
argument, magic_arguments, parse_argstring
4+
)
25
import time, json
36
from jsoniq.session import RumbleSession
47
from py4j.protocol import Py4JJavaError
58

69
@magics_class
710
class JSONiqMagic(Magics):
11+
@magic_arguments()
12+
@argument(
13+
'-t', '--timed', action='store_true', help='Measure execution time.'
14+
)
15+
@argument(
16+
'-df', '--pyspark-data-frame', action='store_true', help='Prints the output as a Pyspark DataFrame (if a schema is available).'
17+
)
18+
@argument(
19+
'-pdf', '--pandas-data-frame', action='store_true', help='Prints the output as a Pandas DataFrame (if a schema is available).'
20+
)
21+
@argument(
22+
'-j', '--json', action='store_true', help='Prints the output as JSON.'
23+
)
24+
@argument(
25+
'-u', '--apply-updates', action='store_true', help='Applies updates if a PUL is output.'
26+
)
827
def run(self, line, cell=None, timed=False):
928
if cell is None:
1029
data = line
1130
else:
1231
data = cell
1332

33+
args = parse_argstring(self.run, line)
1434
start = time.time()
1535
try:
1636
rumble = RumbleSession.builder.getOrCreate();
@@ -28,28 +48,53 @@ def run(self, line, cell=None, timed=False):
2848
print("Query unsuccessful.")
2949
print("Usual reasons: firewall, misconfigured proxy.")
3050
return
31-
end = time.time()
32-
if(timed):
33-
print("Response time: %s ms" % (end - start))
3451

35-
if ("DataFrame" in response.availableOutputs()):
36-
print(response.pdf())
37-
elif ("Local" in response.availableOutputs()):
52+
schema_str = """
53+
No DataFrame available as no schema was detected. If you still believe the output is structured enough, you could add a schema and validate expression explicitly to your query.
54+
55+
This is an example of how you can simply define a schema and wrap your query in a validate expression:
56+
57+
declare type mytype as {
58+
"product" : "string",
59+
"store-number" : "int",
60+
"quantity" : "decimal"
61+
};
62+
validate type mytype* {
63+
for $product in json-lines("http://rumbledb.org/samples/products-small.json", 10)
64+
where $product.quantity ge 995
65+
return $product
66+
}
67+
"""
68+
69+
if(args.pyspark_data_frame):
70+
df = response.df();
71+
if df is not None:
72+
df.show()
73+
74+
if (args.pandas_data_frame):
75+
pdf = response.pdf()
76+
if pdf is not None:
77+
print(pdf)
78+
79+
if (args.apply_updates):
80+
if ("PUL" in response.availableOutputs()):
81+
response.applyPUL()
82+
print("Updates applied successfully.")
83+
else:
84+
print("No Pending Update List (PUL) available to apply.")
85+
86+
if (args.json or (not args.pandas_data_frame and not args.pyspark_data_frame)):
3887
capplusone = response.take(rumble.getRumbleConf().getResultSizeCap() + 1)
3988
if len(capplusone) > rumble.getRumbleConf().getResultSizeCap():
4089
count = response.count()
4190
print("The query output %s items, which is too many to display. Displaying the first %s items:" % (count, rumble.getRumbleConf().getResultSizeCap()))
4291
for e in capplusone[:rumble.getRumbleConf().getResultSizeCap()]:
4392
print(json.dumps(json.loads(e.serializeAsJSON()), indent=2))
44-
elif ("PUL" in response.availableOutputs()):
45-
print("The query output a Pending Update List.")
46-
else:
47-
print("No output available.")
93+
94+
end = time.time()
95+
if(args.timed):
96+
print("Response time: %s ms" % (end - start))
4897

4998
@cell_magic
5099
def jsoniq(self, line, cell=None):
51100
return self.run(line, cell, False)
52-
53-
@cell_magic
54-
def timedjsoniq(self, line, cell=None):
55-
return self.run(line, cell, True)

tests/test_sample.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def test1(self):
3232
# Generally though, the results may contain zero, one, two, or more items.
3333
python_list = items.json()
3434
print(python_list)
35+
self.assertTrue(json.dumps(python_list) == json.dumps((2,)))
3536

3637
############################################
3738
##### More complex, standalone queries #####

0 commit comments

Comments
 (0)