-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfake_data_source.py
More file actions
121 lines (98 loc) · 3.56 KB
/
fake_data_source.py
File metadata and controls
121 lines (98 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
from pyspark.sql.types import StructType
class RangePartition(InputPartition):
    """Half-open interval [start, end) of row ids handled by one reader task."""

    def __init__(self, start, end):
        # Stored as given; `read` converts them to int before iterating.
        self.start, self.end = start, end
class FakeDataSourceReader(DataSourceReader):
    """Batch reader that generates fake card transactions with the `faker` library.

    Recognized options (every value in ``self.options`` arrives as a string):
      ``length``     -- total number of rows to generate across all partitions.
      ``partitions`` -- number of input partitions to split the rows into.
    """

    def __init__(self, schema, options):
        self.schema: StructType = schema
        self.options = options
        print(f"options: {options}")

    def partitions(self):
        """Split [0, length) into `partitions` contiguous row-id ranges.

        Raises:
            ValueError: if ``partitions`` is missing/non-positive or
                ``length`` is negative.
        """
        length = int(self.options.get("length", 0))
        partitions = int(self.options.get("partitions", 0))
        # Fail fast with a clear message instead of the ZeroDivisionError the
        # old code raised whenever the option was absent (its own default).
        if partitions <= 0:
            raise ValueError("the 'partitions' option must be a positive integer")
        if length < 0:
            raise ValueError("the 'length' option must be non-negative")
        chunk = length // partitions
        # The last partition absorbs the remainder so the union of all ranges
        # covers exactly [0, length).  The previous truncating division
        # silently dropped up to `partitions - 1` trailing rows (e.g.
        # length=1000, partitions=3 only covered rows 0..998).
        return [
            RangePartition(
                i * chunk,
                length if i == partitions - 1 else (i + 1) * chunk,
            )
            for i in range(partitions)
        ]

    def read(self, partition):
        """Yield one fake transaction tuple per row id in `partition`."""
        # Library imports must be within the method (executor-side code
        # imports its own dependencies).
        import datetime
        import random

        from faker import Faker

        fake = Faker()
        print(f"{partition.start} {partition.end}")
        # Loop invariants hoisted out of the per-row loop.
        start_date = datetime.datetime.strptime("2000-01-01", "%Y-%m-%d")
        # ~10:1 approved-to-declined ratio.
        statuses = ["approved"] * 10 + ["declined"]
        for _rowid in range(int(partition.start), int(partition.end)):
            transaction_id = fake.unique.uuid4()
            client_id = random.randint(1, 2000)
            # A timedelta end_date is interpreted by faker relative to today.
            tran_date = fake.date_between(
                start_date=start_date, end_date=datetime.timedelta(days=1)
            )
            yield (
                str(transaction_id),
                str(client_id),
                str(fake.random_int(min=1, max=100000) / 10),
                tran_date.strftime("%Y-%m-%d"),
                fake.company(),
                random.choice(statuses),
            )
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import StructType
class FakeDataSource(DataSource):
    """
    An example data source for batch query using the `faker` library.
    """

    @classmethod
    def name(cls):
        # Short name used as spark.read.format("fake").
        return "fake"

    def schema(self):
        # Default schema applied when the caller supplies none; all columns
        # are strings.
        return "transaction_id string, card_id string, transaction_amount string, transaction_date string, merchant string, status string"

    def reader(self, schema: StructType):
        # Delegate row generation to the reader, forwarding the user options.
        return FakeDataSourceReader(schema, self.options)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
def create_spark_session(app_name="PySpark Example Application"):
    """Build (or reuse) a local SparkSession with off-heap memory enabled."""
    builder = SparkSession.builder.appName(app_name).master("local[*]")
    # 1 GB of off-heap storage keeps large shuffles off the JVM heap.
    builder = builder.config("spark.memory.offHeap.enabled", "true")
    builder = builder.config("spark.memory.offHeap.size", "1g")
    return builder.getOrCreate()
if __name__ == "__main__":
    print("Fake data generator")
    spark = create_spark_session()
    # Make the custom "fake" format visible to this session's readers.
    spark.dataSource.register(FakeDataSource)
    reader = (
        spark.read.format("fake")
        .option("length", "1000")
        .option("partitions", "5")
    )
    df = reader.load("/dev/faker")
    # Show a small sample, then persist the full dataset as CSV.
    df.limit(10).show()
    df.write.format("csv").mode("overwrite").save("./fakedata-02")