@@ -52,6 +52,8 @@ pip install jsoniq
5252We will make more documentation available as we go. In the meantime, you will find sample code below that should just run
5353after installing the library.
5454
55+ You can directly copy and paste the code below into a Python file and execute it with Python.
56+
5557```
5658from jsoniq import RumbleSession
5759
@@ -83,13 +85,41 @@ modes = res.availableOutputs();
8385for mode in modes:
8486 print(mode)
8587
86- ###### Parallel access ######
88+ #########################################################
89+ ###### Manipulating DataFrames with SQL and JSONiq ######
90+ #########################################################
8791
88- # This returns a regular data frame that can be further processed with spark.sql() or rumble.jsoniq().
92+ # If the output of the JSONiq query is structured (i.e., RumbleDB was able to detect a schema),
93+ # then we can extract a regular data frame that can be further processed with spark.sql() or rumble.jsoniq().
8994df = res.df();
9095df.show();
9196
97+ # We are continuously working on the detection of schemas and RumbleDB will get better at it with time.
98+ # JSONiq is a very powerful language and can also produce heterogeneous output "by design". Then you need
99+ # to use rdd() instead of df(), or to collect the list of JSON values (see further down). Remember
100+ # that availableOutputs() tells you what is at your disposal.
101+
102+ # A DataFrame output by JSONiq can be reused as input to a Spark SQL query.
103+ # (Remember that rumble is a wrapper around a SparkSession object, so you can use rumble.sql() just like spark.sql())
104+ df.createTempView("input")
105+ df2 = rumble.sql("SELECT * FROM input").toDF("name");
106+ df2.show();
107+
108+ # A DataFrame output by Spark SQL can be reused as input to a JSONiq query.
109+ rumble.bindDataFrameAsVariable('$b', df2);
110+ seq2 = rumble.jsoniq("for $i in 1 to 5 return $b");
111+ df3 = seq2.df();
112+ df3.show();
113+
114+ # And a DataFrame output by JSONiq can be reused as input to another JSONiq query.
115+ rumble.bindDataFrameAsVariable('$b', df3);
116+ seq3 = rumble.jsoniq("$b[position() lt 3]");
117+ df4 = seq3.df();
118+ df4.show();
119+
120+ #########################
92121##### Local access ######
122+ #########################
93123
94124# This materializes the rows as items.
95125# The items are accessed with the RumbleDB Item API.
@@ -103,7 +133,9 @@ while (res.hasNext()):
103133 print(res.next().getStringValue());
104134res.close();
105135
136+ ################################################################################################################
106137###### Native Python/JSON Access for bypassing the Item API (but losing on the richer JSONiq type system) ######
138+ ################################################################################################################
107139
108140# This method directly gets the result as JSON (dict, list, strings, ints, etc).
109141jlist = res.json();
@@ -122,6 +154,10 @@ print(rdd.count());
122154for str in rdd.take(10):
123155 print(str);
124156
157+ ###################################################
158+ ###### Write back to the disk (or data lake) ######
159+ ###################################################
160+
125161# It is also possible to write the output to a file locally or on a cluster. The API is similar to that of Spark dataframes.
126162# Note that it creates a directory and stores the (potentially very large) output in a sharded directory.
127163# RumbleDB was already tested with up to 64 AWS machines and 100s of TBs of data.
@@ -134,7 +170,9 @@ seq.write().mode("overwrite").parquet("outputparquet");
134170seq = rumble.jsoniq("1+1");
135171seq.write().mode("overwrite").text("outputtext");
136172
137- # A more complex, standalone query
173+ ############################################
174+ ##### More complex, standalone queries #####
175+ ############################################
138176
139177seq = rumble.jsoniq("""
140178
@@ -181,9 +219,15 @@ return {
181219print(seq.json());
182220
183221```
222+ # How to learn JSONiq, and more query examples
223+
224+ Even more queries can be found [here](https://colab.research.google.com/github/RumbleDB/rumble/blob/master/RumbleSandbox.ipynb) and you can look at the [JSONiq documentation](https://www.jsoniq.org) and tutorials.
184225
185226# Last updates
186227
228+ ## Version 0.1.0 alpha 11
229+ - Fix an issue when feeding a DataFrame output by rumble.jsoniq() back to a new JSONiq query (as a variable).
230+
187231## Version 0.1.0 alpha 10
188232- Add an explicit explanation on stderr if the Java version is not properly set, together with hints.
189233
0 commit comments