Merge pull request apache#25916: Use built-in CSV and JSON readers.

robertwb · web-flow · commit d80f1ff499a9 · 2023-03-24T10:51:51.000-07:00
diff --git a/sdks/python/apache_beam/yaml/yaml_provider.py b/sdks/python/apache_beam/yaml/yaml_provider.py
@@ -314,22 +314,6 @@ def expand(self, pcolls):
       for key in dir(apache_beam.io)
       if key.startswith('ReadFrom') or key.startswith('WriteTo')
   }
-  ios['ReadFromCsv'] = lambda **kwargs: apache_beam.dataframe.io.ReadViaPandas(
-      'csv', **kwargs)
-  ios['WriteToCsv'] = lambda **kwargs: apache_beam.dataframe.io.WriteViaPandas(
-      'csv', **kwargs)
-  ios['ReadFromJson'] = (
-      lambda *,
-      orient='records',
-      lines=True,
-      **kwargs: apache_beam.dataframe.io.ReadViaPandas(
-          'json', orient=orient, lines=lines, **kwargs))
-  ios['WriteToJson'] = (
-      lambda *,
-      orient='records',
-      lines=True,
-      **kwargs: apache_beam.dataframe.io.WriteViaPandas(
-          'json', orient=orient, lines=lines, **kwargs))
 
   return InlineProvider(
       dict({
diff --git a/sdks/python/apache_beam/yaml/yaml_transform_test.py b/sdks/python/apache_beam/yaml/yaml_transform_test.py
@@ -15,7 +15,10 @@
 # limitations under the License.
 #
 
+import glob
 import logging
+import os
+import tempfile
 import unittest
 
 import apache_beam as beam
@@ -102,6 +105,46 @@ def test_chain_with_root(self):
           ''')
       assert_that(result, equal_to([41, 43, 47, 53, 61, 71, 83, 97, 113, 131]))
 
+  def test_csv_to_json(self):
+    """Round-trips a small table through ReadFromCsv and WriteToJson."""
+    try:
+      import pandas as pd
+    except ImportError:
+      raise unittest.SkipTest('Pandas not available.')
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+      data = pd.DataFrame([
+          {'label': '11a', 'rank': 0},
+          {'label': '37a', 'rank': 1},
+          {'label': '389a', 'rank': 2},
+      ])
+      input_path = os.path.join(tmpdir, 'input.csv')
+      output_path = os.path.join(tmpdir, 'output.json')
+      data.to_csv(input_path, index=False)
+
+      # repr() wraps each path in quotes before it is spliced into the spec.
+      with beam.Pipeline() as p:
+        _ = p | YamlTransform(
+            '''
+            type: chain
+            transforms:
+              - type: ReadFromCsv
+                path: %s
+              - type: WriteToJson
+                path: %s
+                num_shards: 1
+            ''' % (repr(input_path), repr(output_path)))
+
+      # The written file name may carry a shard suffix, hence the glob.
+      output_shard = glob.glob(output_path + "*")[0]
+      # Rows may come back in any order: sort by rank, then rebuild the index
+      # so it matches the fixture's 0..n-1 labels. A bare reindex() leaves the
+      # permuted labels in place, which assert_frame_equal would reject.
+      result = pd.read_json(
+          output_shard, orient='records',
+          lines=True).sort_values('rank').reset_index(drop=True)
+      pd.testing.assert_frame_equal(data, result)
+
 
 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)