How to do context-prompting for generating a column in synthetic dataset? #15453

Achmad96 · 2026-02-28T13:44:09Z

Achmad96
Feb 28, 2026

Hi, I'm designing a synthetic transaction dataset and am struggling to keep the fraud rate at only 0.10% across the whole dataset. I think the solution lies in how I implement context prompting, so that the model can take the previously generated rows into account when it generates the next one.

# Timestamp column: values come from the DATETIME sampler, bounded by the
# start and end strings given here.
timestamp_params = DatetimeSamplerParams(start="2021-1-1", end="2026-1-1")
timestamp_column = SamplerColumnConfig(
    name="timestamp",
    sampler_type=SamplerType.DATETIME,
    params=timestamp_params,
)
config_builder.add_column(timestamp_column)

# Transaction identifier: drawn from the UUID sampler with short_form enabled.
transaction_id_column = SamplerColumnConfig(
    name="transaction_id",
    sampler_type=SamplerType.UUID,
    params=UUIDSamplerParams(short_form=True),
)
config_builder.add_column(transaction_id_column)

# Customer identifier: drawn from the UUID sampler with short_form disabled.
# NOTE(review): a UUID sampled per row presumably means no customer ever
# appears twice in the dataset — confirm that is intended.
customer_id_column = SamplerColumnConfig(
    name="customer_id",
    sampler_type=SamplerType.UUID,
    params=UUIDSamplerParams(short_form=False),
)
config_builder.add_column(customer_id_column)

# Transaction amount: UNIFORM sampler between 1000 and 10,000,000 with zero
# decimal places, converted to int.
transaction_total_params = UniformSamplerParams(
    low=1000,
    high=10_000_000,
    decimal_places=0,
)
transaction_total_column = SamplerColumnConfig(
    name="transaction_total",
    sampler_type=SamplerType.UNIFORM,
    params=transaction_total_params,
    convert_to="int",
)
config_builder.add_column(transaction_total_column)

# Merchant category: CATEGORY sampler. Each label is kept next to its weight
# so the two lists handed to the sampler cannot drift out of alignment.
merchant_category_weights = {
    "Electronics": 0.2,
    "Clothing": 0.3,
    "Groceries": 0.25,
    "Home & Garden": 0.15,
    "Sports": 0.1,
}
merchant_category_column = SamplerColumnConfig(
    name="merchant_category",
    sampler_type=SamplerType.CATEGORY,
    params=CategorySamplerParams(
        values=list(merchant_category_weights),
        weights=list(merchant_category_weights.values()),
    ),
)
config_builder.add_column(merchant_category_column)

# Payment method: CATEGORY sampler over four labels; the weights are listed in
# the same order as the labels.
payment_methods = ["Credit Card", "Debit Card", "PayPal", "Bank Transfer"]
payment_method_weights = [0.3, 0.3, 0.15, 0.25]
payment_method_column = SamplerColumnConfig(
    name="payment_method",
    sampler_type=SamplerType.CATEGORY,
    params=CategorySamplerParams(
        values=payment_methods,
        weights=payment_method_weights,
    ),
)
config_builder.add_column(payment_method_column)

# Region: CATEGORY sampler over five cities, each stored alongside its weight.
region_weights = {
    "Jakarta": 0.3,
    "Surabaya": 0.25,
    "Bandung": 0.2,
    "Medan": 0.15,
    "Semarang": 0.1,
}
region_params = CategorySamplerParams(
    values=list(region_weights),
    weights=list(region_weights.values()),
)
config_builder.add_column(
    SamplerColumnConfig(
        name="region",
        sampler_type=SamplerType.CATEGORY,
        params=region_params,
    )
)

# Quantity in the transaction: UNIFORM sampler between 1 and 100 with zero
# decimal places, converted to int.
total_quantity_params = UniformSamplerParams(low=1, high=100, decimal_places=0)
total_quantity_column = SamplerColumnConfig(
    name="total_quantity",
    sampler_type=SamplerType.UNIFORM,
    params=total_quantity_params,
    convert_to="int",
)
config_builder.add_column(total_quantity_column)

# The prompt below is rendered from the current row's columns only
# ({{total_quantity}}, {{transaction_total}}), so an instruction such as
# "for 10% of the transactions" gives the model nothing to count against and
# the realised rate is left to chance. The outcome is therefore drawn by a
# weighted sampler and handed to the model as an explicit per-row scenario.
# The weights keep the 90%/10% split the prompt previously asked the model to
# produce; change them to retarget the rate (e.g. [0.999, 0.001] for 0.10%).
config_builder.add_column(
    SamplerColumnConfig(
        name="transaction_scenario",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["normal", "anomalous"],
            weights=[0.9, 0.1],
        ),
    )
)

# LLM-generated structured fields (total_quantity_7d and
# reported_transaction_total), returned in the Transaction output format.
# Changes to the prompt text:
#   * no percentages are left in the prompt, which also removes the stray
#     "90%%" (the string is never %-formatted, so the model saw it literally);
#   * the anomalous threshold is ">= 2", matching the ">= 2.0" test applied by
#     the fraud_analysis judge prompt;
#   * the fixed "between 1000 and 1000,000" range is dropped: it contradicted
#     "equal to {{transaction_total}}", and transaction_total can reach
#     10,000,000.
# NOTE(review): Transaction is defined outside this snippet — confirm that its
# field names and numeric types match what the prompt asks for.
config_builder.add_column(
    LLMStructuredColumnConfig(
        name="structured_transaction",
        model_alias=structure_model_alias,
        prompt=(
            "The scenario for this transaction is '{{transaction_scenario}}'.\n"
            "Generate a realistic positive value for total_quantity_7d. "
            "If the scenario is 'anomalous', choose it so that the ratio "
            "{{total_quantity}} / total_quantity_7d is greater than or equal to 2. "
            "If the scenario is 'normal', choose it so that the ratio is less than 2.\n"
            "Generate a value for reported_transaction_total. "
            "If the scenario is 'normal', it must be exactly {{transaction_total}}. "
            "If the scenario is 'anomalous', it must be a positive amount lower than {{transaction_total}}.\n"
        ),
        output_format=Transaction,
    )
)

# Derived column: total_quantity divided by the generated
# structured_transaction.total_quantity_7d, stored with dtype "float".
# NOTE(review): if the model ever returns 0 for total_quantity_7d this
# division presumably fails when the expression is rendered — confirm how the
# library handles that and add a guard if rows would otherwise be lost.
quantity_spike_ratio_column = ExpressionColumnConfig(
    name="quantity_spike_ratio",
    expr="{{ total_quantity / structured_transaction.total_quantity_7d }}",
    dtype="float",
)
config_builder.add_column(quantity_spike_ratio_column)

# Derived column: generated reported_transaction_total minus the sampled
# transaction_total; 0 means the two totals agree.
# The dtype is now stated explicitly, as it is for quantity_spike_ratio, so the
# delta is stored as a number. Without it the column presumably falls back to
# the library's default dtype (a string) — confirm against the library docs.
# NOTE(review): "float" accepts both integer and decimal results; switch to
# "int" if Transaction declares reported_transaction_total as an integer.
config_builder.add_column(
    ExpressionColumnConfig(
        name="transaction_total_delta",
        expr="{{ structured_transaction.reported_transaction_total - transaction_total }}",
        dtype="float",
    )
)

# LLM judge column: scores each row on a single "is_fraud" rubric whose only
# options are 0 and 1, using transaction_total_delta and quantity_spike_ratio.
# Changes to the prompt text:
#   * the verdict is requested as is_fraud = 1 / 0 instead of True / False, so
#     the instruction matches the options the rubric actually offers;
#   * each placeholder is labelled with its column name, so the rendered
#     prompt reads "transaction_total_delta is -500" rather than "the -500".
# NOTE(review): both conditions are plain numeric comparisons, so an
# expression column could presumably compute the flag without a model call —
# consider that if the judge's verdicts turn out to be noisy.
config_builder.add_column(
    LLMJudgeColumnConfig(
        name="fraud_analysis",
        model_alias=judge_model_alias,
        prompt=(
            "Determine if the transaction is fraudulent based on the following conditions:\n"
            "1. transaction_total_delta is {{transaction_total_delta}}. If it is not equal to 0, it indicates a discrepancy between the expected and reported transaction totals, which is a strong indicator of potential fraud.\n"
            "2. quantity_spike_ratio is {{quantity_spike_ratio}}. If it is greater than or equal to 2.0, it suggests a significant increase in quantity compared to the average, which could also indicate fraudulent activity.\n"
            "If either of these conditions is met, classify the transaction as fraudulent (is_fraud = 1). Otherwise, classify it as non-fraudulent (is_fraud = 0)."
        ),
        scores=[
            Score(
                name="is_fraud",
                description="Indicates whether the transaction is fraudulent (1) or not (0).",
                options={
                    0: "The transaction is not fraud",
                    1: "The transaction is fraud",
                },
            )
        ],
    )
)

# Run the builder's validation over the columns registered so far.
config_builder.validate()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to do context-prompting for generating a column in synthetic dataset? #15453

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{editor}}'s edit

{{editor}}'s edit

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

How to do context-prompting for generating a column in synthetic dataset? #15453

Uh oh!

Uh oh!

Achmad96 Feb 28, 2026

Replies: 0 comments

Achmad96
Feb 28, 2026