-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfake_data_source.py
More file actions
121 lines (98 loc) · 3.56 KB
/
fake_data_source.py
File metadata and controls
121 lines (98 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from pyspark.sql.datasource import DataSource, DataSourceReader, InputPartition
from pyspark.sql.types import StructType
class RangePartition(InputPartition):
    """Half-open interval [start, end) of row ids handled by one reader task."""

    def __init__(self, start, end):
        # Stored as given; `read` converts them to int before iterating.
        self.start, self.end = start, end
class FakeDataSourceReader(DataSourceReader):
    """Batch reader that generates fake card transactions with the `faker` library.

    Recognized options (every value in ``self.options`` arrives as a string):
      ``length``     -- total number of rows to generate across all partitions.
      ``partitions`` -- number of input partitions to split the rows into.
    """

    def __init__(self, schema, options):
        self.schema: StructType = schema
        self.options = options
        print(f"options: {options}")

    def partitions(self):
        """Split [0, length) into `partitions` contiguous row-id ranges.

        Raises:
            ValueError: if ``partitions`` is missing/non-positive or
                ``length`` is negative.
        """
        length = int(self.options.get("length", 0))
        partitions = int(self.options.get("partitions", 0))
        # Fail fast with a clear message instead of the ZeroDivisionError the
        # old code raised whenever the option was absent (its own default).
        if partitions <= 0:
            raise ValueError("the 'partitions' option must be a positive integer")
        if length < 0:
            raise ValueError("the 'length' option must be non-negative")
        chunk = length // partitions
        # The last partition absorbs the remainder so the union of all ranges
        # covers exactly [0, length).  The previous truncating division
        # silently dropped up to `partitions - 1` trailing rows (e.g.
        # length=1000, partitions=3 only covered rows 0..998).
        return [
            RangePartition(
                i * chunk,
                length if i == partitions - 1 else (i + 1) * chunk,
            )
            for i in range(partitions)
        ]

    def read(self, partition):
        """Yield one fake transaction tuple per row id in `partition`."""
        # Library imports must be within the method (executor-side code
        # imports its own dependencies).
        import datetime
        import random

        from faker import Faker

        fake = Faker()
        print(f"{partition.start} {partition.end}")
        # Loop invariants hoisted out of the per-row loop.
        start_date = datetime.datetime.strptime("2000-01-01", "%Y-%m-%d")
        # ~10:1 approved-to-declined ratio.
        statuses = ["approved"] * 10 + ["declined"]
        for _rowid in range(int(partition.start), int(partition.end)):
            transaction_id = fake.unique.uuid4()
            client_id = random.randint(1, 2000)
            # A timedelta end_date is interpreted by faker relative to today.
            tran_date = fake.date_between(
                start_date=start_date, end_date=datetime.timedelta(days=1)
            )
            yield (
                str(transaction_id),
                str(client_id),
                str(fake.random_int(min=1, max=100000) / 10),
                tran_date.strftime("%Y-%m-%d"),
                fake.company(),
                random.choice(statuses),
            )
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import StructType
class FakeDataSource(DataSource):
    """
    An example data source for batch query using the `faker` library.
    """

    @classmethod
    def name(cls):
        # Short name used as spark.read.format("fake").
        return "fake"

    def schema(self):
        # Default schema applied when the caller supplies none; all columns
        # are strings.
        return "transaction_id string, card_id string, transaction_amount string, transaction_date string, merchant string, status string"

    def reader(self, schema: StructType):
        # Delegate row generation to the reader, forwarding the user options.
        return FakeDataSourceReader(schema, self.options)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
def create_spark_session(app_name="PySpark Example Application"):
    """Build (or reuse) a local SparkSession with off-heap memory enabled."""
    builder = SparkSession.builder.appName(app_name).master("local[*]")
    # 1 GB of off-heap storage keeps large shuffles off the JVM heap.
    builder = builder.config("spark.memory.offHeap.enabled", "true")
    builder = builder.config("spark.memory.offHeap.size", "1g")
    return builder.getOrCreate()
if __name__ == "__main__":
    print("Fake data generator")
    spark = create_spark_session()
    # Make the custom "fake" format visible to this session's readers.
    spark.dataSource.register(FakeDataSource)
    reader = (
        spark.read.format("fake")
        .option("length", "1000")
        .option("partitions", "5")
    )
    df = reader.load("/dev/faker")
    # Show a small sample, then persist the full dataset as CSV.
    df.limit(10).show()
    df.write.format("csv").mode("overwrite").save("./fakedata-02")