Skip to content

Commit a27418a

Browse files
structured-kb-demo/
1 parent b85d031 commit a27418a

15 files changed

Lines changed: 275 additions & 87 deletions

structured-kb-demo/.env.example

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
ACCOUNT_ID=
2+
AWS_ACCESS_KEY_ID=
3+
AWS_SECRET_ACCESS_KEY=
4+
BUCKET=
5+
CRAWLER_IAM_POLICY=
6+
CRAWLER_IAM_ROLE=
7+
CRAWLER=
8+
DB=
9+
FOLDER=
10+
QUEUE=
11+
REGION=

structured-kb-demo/arns.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from vars import (
2+
ACCOUNT_ID,
3+
BUCKET,
4+
CRAWLER_IAM_POLICY,
5+
CRAWLER_IAM_ROLE,
6+
REDSHIFT_IAM_ROLE,
7+
REGION,
8+
QUEUE,
9+
)
10+
11+
# ARNs of AWS-managed IAM policies; these live under the shared "aws" account
# namespace and do not depend on any project configuration.
_AWS_MANAGED_POLICY_PREFIX = "arn:aws:iam::aws:policy"
AWS_MANAGED_GLUE_IAM_POLICY_ARN = _AWS_MANAGED_POLICY_PREFIX + "/service-role/AWSGlueServiceRole"
AWS_MANAGED_REDSHIFT_IAM_POLICY_ARN = _AWS_MANAGED_POLICY_PREFIX + "/AmazonRedshiftAllCommandsFullAccess"

# ARNs of account-owned IAM resources, derived from the configured account id
# and resource names.
_ACCOUNT_IAM_PREFIX = "arn:aws:iam::{}".format(ACCOUNT_ID)
CRAWLER_IAM_POLICY_ARN = "{}:policy/{}".format(_ACCOUNT_IAM_PREFIX, CRAWLER_IAM_POLICY)
CRAWLER_IAM_ROLE_ARN = "{}:role/{}".format(_ACCOUNT_IAM_PREFIX, CRAWLER_IAM_ROLE)
REDSHIFT_IAM_ROLE_ARN = "{}:role/{}".format(_ACCOUNT_IAM_PREFIX, REDSHIFT_IAM_ROLE)

# ARNs of the S3 bucket and the SQS queue used by the demo.
BUCKET_ARN = "arn:aws:s3:::{}".format(BUCKET)
QUEUE_ARN = "arn:aws:sqs:{}:{}:{}".format(REGION, ACCOUNT_ID, QUEUE)

structured-kb-demo/create_s3_bucket.py

Lines changed: 0 additions & 18 deletions
This file was deleted.
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import time
2+
3+
import boto3
4+
5+
from logger import logger
6+
from vars import CRAWLER, REGION
7+
8+
def run_glue_crawler(crawler_name, *, timeout=None):
    """Start a Glue crawler and poll until it leaves the RUNNING/STOPPING states.

    Args:
        crawler_name: Name of the Glue crawler to start.
        timeout: Optional maximum number of seconds to keep polling. ``None``
            (the default) polls until the crawler reports a state other than
            RUNNING or STOPPING, matching the previous behavior.

    Returns:
        None. Progress and outcome are reported through ``logger``.

    All errors are logged rather than raised: an already-running crawler is
    logged as a warning, anything else as an error.
    """
    glue = boto3.client("glue", region_name=REGION)

    # Seconds to wait before the next poll, and the progress message to log,
    # for each state in which the crawler is still busy.
    busy_states = {
        "RUNNING": (30, "Crawler is still running..."),
        "STOPPING": (10, "Crawler is stopping..."),
    }
    deadline = None if timeout is None else time.monotonic() + timeout

    try:
        glue.start_crawler(Name=crawler_name)
        logger.info("Crawler started.")

        while True:
            crawler = glue.get_crawler(Name=crawler_name)["Crawler"]
            status = crawler["State"]

            if status not in busy_states:
                logger.info(f"Crawler finished. Final State: {status}")
                # The crawler state only says it is idle again; the outcome of
                # the run itself is reported separately under LastCrawl.
                last_crawl = crawler.get("LastCrawl") or {}
                outcome = last_crawl.get("Status")
                if outcome is not None and outcome != "SUCCEEDED":
                    detail = last_crawl.get("ErrorMessage", "no error message provided")
                    logger.error(f"Last crawl status: {outcome} ({detail})")
                break

            delay, message = busy_states[status]
            logger.info(message)

            if deadline is not None and time.monotonic() >= deadline:
                logger.error(
                    f"Timed out after {timeout} seconds waiting for crawler. Last State: {status}"
                )
                break

            time.sleep(delay)

    except glue.exceptions.CrawlerRunningException:
        logger.warning("Crawler is already running.")
    except Exception as e:
        logger.error(f"Error: {str(e)}")


run_glue_crawler(CRAWLER)

structured-kb-demo/setup_event_notification.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import boto3
22

3+
from arns import QUEUE_ARN
34
from logger import logger
4-
from vars import BUCKET_NAME, QUEUE_ARN
5+
from vars import BUCKET
56

67
s3 = boto3.client("s3")
78

@@ -19,9 +20,9 @@
1920

2021
try:
2122
s3.put_bucket_notification_configuration(
22-
Bucket=BUCKET_NAME,
23+
Bucket=BUCKET,
2324
NotificationConfiguration=notification_configuration
2425
)
25-
print(f"Successfully added event notifications to {BUCKET_NAME}")
26+
print(f"Successfully added event notifications")
2627
except Exception as e:
2728
print(f"Error: {e}")
Lines changed: 28 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,37 @@
11
import boto3
22

3-
from vars import ACCOUNT_ID, BUCKET_NAME, DB_NAME, GLUE_CRAWLER_IAM_ROLE, GLUE_CRAWLER_NAME, QUEUE_NAME, REGION
3+
from arns import CRAWLER_IAM_ROLE_ARN, QUEUE_ARN
4+
from logger import logger
5+
from vars import BUCKET, CRAWLER, DB, FOLDER
46

5-
CRAWLER_IAM_ROLE_ARN = f"arn:aws:iam::842960110593:role/service-role/{GLUE_CRAWLER_IAM_ROLE}"
6-
S3_PATH = f"s3://{BUCKET_NAME}/"
7-
SQS_ARN = f"arn:aws:sqs:{REGION}:{ACCOUNT_ID}:{QUEUE_NAME}"
7+
S3_PATH = f"s3://{BUCKET}/{FOLDER}"
88

99
glue = boto3.client("glue", region_name="us-east-1")
1010

11-
try:
12-
response = glue.create_crawler(
13-
Name=GLUE_CRAWLER_NAME,
14-
Role=CRAWLER_IAM_ROLE_ARN,
15-
DatabaseName=DB_NAME,
16-
Description="Crawler for inventory data triggered by SQS events",
17-
Targets={
18-
"S3Targets": [
19-
{
20-
"Path": S3_PATH,
21-
"EventQueueArn": SQS_ARN # Enables S3 event-aware crawling
22-
}
23-
]
24-
},
25-
# "On-demand" means we don"t provide a Cron schedule
26-
SchemaChangePolicy={
27-
"UpdateBehavior": "UPDATE_IN_DATABASE",
28-
"DeleteBehavior": "DELETE_FROM_DATABASE"
29-
},
30-
RecrawlPolicy={
31-
"RecrawlBehavior": "CRAWL_EVENT_MODE" # Processes only events from SQS
32-
}
33-
)
34-
print(f"Crawler created successfully.")
11+
CRAWLER_CONFIG = {
12+
"Role": CRAWLER_IAM_ROLE_ARN,
13+
"DatabaseName": DB,
14+
"Description": "Crawler for inventory data triggered by SQS events",
15+
"Targets": {
16+
"S3Targets": [
17+
{
18+
"Path": S3_PATH,
19+
"EventQueueArn": QUEUE_ARN,
20+
}
21+
]
22+
},
23+
"SchemaChangePolicy": {
24+
"UpdateBehavior": "UPDATE_IN_DATABASE",
25+
"DeleteBehavior": "DELETE_FROM_DATABASE",
26+
},
27+
"RecrawlPolicy": {"RecrawlBehavior": "CRAWL_EVENT_MODE"},
28+
}
3529

30+
try:
31+
glue.create_crawler(Name=CRAWLER, **CRAWLER_CONFIG)
32+
logger.info("Crawler created successfully.")
3633
except glue.exceptions.AlreadyExistsException:
37-
print(f"Crawler already exists.")
34+
glue.update_crawler(Name=CRAWLER, **CRAWLER_CONFIG)
35+
logger.info("Crawler already exists. Updated configuration successfully.")
3836
except Exception as e:
39-
print(f"Error creating crawler: {e}")
37+
logger.error(f"Error creating crawler: {e}")

structured-kb-demo/setup_glue_crawler_iam_policy.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22

33
import json
44

5+
from arns import BUCKET_ARN, CRAWLER_IAM_POLICY_ARN, QUEUE_ARN
6+
57
from logger import logger
6-
from vars import BUCKET_ARN, GLUE_CRAWLER_IAM_POLICY
8+
from vars import CRAWLER_IAM_POLICY
79

810
iam = boto3.client('iam')
911

@@ -23,24 +25,38 @@
2325
{
2426
"Effect": "Allow",
2527
"Action": [
26-
"sqs:ReceiveMessage",
2728
"sqs:DeleteMessage",
28-
"sqs:GetQueueAttributes"
29+
"sqs:GetQueueAttributes",
30+
"sqs:GetQueueUrl",
31+
"sqs:PurgeQueue",
32+
"sqs:ReceiveMessage",
33+
"sqs:SetQueueAttributes"
2934
],
30-
"Resource": "arn:aws:sqs:us-east-1:842960110593:structured-kb-demo-queue"
35+
"Resource": QUEUE_ARN
3136
},
3237
]
3338
}
3439

3540
try:
3641
iam.create_policy(
37-
PolicyName=GLUE_CRAWLER_IAM_POLICY,
42+
PolicyName=CRAWLER_IAM_POLICY,
3843
PolicyDocument=json.dumps(policy_document),
3944
Description='Permissions for Glue Crawler to crawl S3 and use SQS Events'
4045
)
4146
logger.info(f"Policy created successfully!")
4247

4348
except iam.exceptions.EntityAlreadyExistsException:
44-
logger.error(f"Policy already exists.")
49+
logger.info(f"Policy already exists. Updating...")
50+
try:
51+
# Create a new policy version
52+
iam.create_policy_version(
53+
PolicyArn=CRAWLER_IAM_POLICY_ARN,
54+
PolicyDocument=json.dumps(policy_document),
55+
SetAsDefault=True
56+
)
57+
logger.info(f"Policy updated successfully!")
58+
except Exception as e:
59+
logger.error(f"Error updating policy: {e}")
60+
4561
except Exception as e:
4662
logger.error(f"Error: {e}")

structured-kb-demo/setup_glue_crawler_iam_role.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
import json
44

5+
from arns import AWS_MANAGED_GLUE_IAM_POLICY_ARN, CRAWLER_IAM_POLICY_ARN
56
from logger import logger
6-
from vars import ACCOUNT_ID, GLUE_CRAWLER_IAM_POLICY, GLUE_CRAWLER_IAM_ROLE
7+
from vars import ACCOUNT_ID, CRAWLER_IAM_ROLE
78

89
iam = boto3.client("iam")
910

@@ -27,33 +28,31 @@
2728

2829
try:
2930
iam.create_role(
30-
RoleName=GLUE_CRAWLER_IAM_ROLE,
31+
RoleName=CRAWLER_IAM_ROLE,
3132
AssumeRolePolicyDocument=json.dumps(trust_policy)
3233
)
3334
logger.info(f"Created role")
3435
except iam.exceptions.EntityAlreadyExistsException:
35-
logger.info(f"Role {GLUE_CRAWLER_IAM_ROLE} already exists.")
36+
logger.info(f"Role already exists.")
3637

3738

3839
# Get existing attached policies
39-
response = iam.list_attached_role_policies(RoleName=GLUE_CRAWLER_IAM_ROLE)
40+
response = iam.list_attached_role_policies(RoleName=CRAWLER_IAM_ROLE)
4041
attached_policies = [p['PolicyArn'] for p in response.get('AttachedPolicies', [])]
4142

42-
aws_glue_policy = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
43-
if aws_glue_policy not in attached_policies:
43+
if AWS_MANAGED_GLUE_IAM_POLICY_ARN not in attached_policies:
4444
iam.attach_role_policy(
45-
RoleName=GLUE_CRAWLER_IAM_ROLE,
46-
PolicyArn=aws_glue_policy
45+
RoleName=CRAWLER_IAM_ROLE,
46+
PolicyArn=AWS_MANAGED_GLUE_IAM_POLICY_ARN
4747
)
4848
logger.info("AWS Glue Service Role policy attached.")
4949
else:
5050
logger.info("AWS Glue Service Role policy already attached.")
5151

52-
custom_policy = f"arn:aws:iam::{ACCOUNT_ID}:policy/{GLUE_CRAWLER_IAM_POLICY}"
53-
if custom_policy not in attached_policies:
52+
if CRAWLER_IAM_POLICY_ARN not in attached_policies:
5453
iam.attach_role_policy(
55-
RoleName=GLUE_CRAWLER_IAM_ROLE,
56-
PolicyArn=custom_policy
54+
RoleName=CRAWLER_IAM_ROLE,
55+
PolicyArn=CRAWLER_IAM_POLICY_ARN
5756
)
5857
logger.info("Custom Glue Crawler policy attached.")
5958
else:
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
'Description': 'Database for Bedrock structured knowledge base demo',
1313
}
1414
)
15-
logger.info(f"Glue database '{DB_NAME}' created successfully.")
15+
logger.info(f"Glue database created successfully.")
1616

1717
except glue.exceptions.AlreadyExistsException:
18-
logger.error(f"Database {DB_NAME} already exists.")
18+
logger.error(f"Database already exists.")
1919
except Exception as e:
2020
logger.error(f"Error creating Glue database: {e}")
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import boto3
2+
import json
3+
4+
from arns import AWS_MANAGED_REDSHIFT_IAM_POLICY_ARN
5+
from logger import logger
6+
from vars import REDSHIFT_IAM_ROLE, REGION
7+
8+
iam = boto3.client("iam", region_name=REGION)
9+
10+
# Trust policy: the services listed here are allowed to assume the role.
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": [
                    "redshift-serverless.amazonaws.com",
                    "redshift.amazonaws.com",
                    "sagemaker.amazonaws.com"
                ]
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Step 1: create the role. An already-existing role is not fatal; the script
# continues so that the policy attachment below is still applied on re-runs.
try:
    iam.create_role(
        RoleName=REDSHIFT_IAM_ROLE,
        AssumeRolePolicyDocument=json.dumps(trust_policy),
        Description="Role for Redshift Serverless to access Glue and S3"
    )
    logger.info(f"Created role: {REDSHIFT_IAM_ROLE}")
    role_available = True
except iam.exceptions.EntityAlreadyExistsException:
    logger.warning(f"Role {REDSHIFT_IAM_ROLE} already exists.")
    role_available = True
except Exception as e:
    logger.error(f"Error: {e}")
    role_available = False

# Step 2: attach the managed policy whenever the role is available, so a role
# left without the policy by an earlier partial run gets repaired.
# NOTE(review): relies on AttachRolePolicy succeeding when the policy is
# already attached — confirm against the IAM API reference.
if role_available:
    try:
        iam.attach_role_policy(
            RoleName=REDSHIFT_IAM_ROLE,
            PolicyArn=AWS_MANAGED_REDSHIFT_IAM_POLICY_ARN
        )
        logger.info(f"Attached AmazonRedshiftAllCommandsFullAccess to {REDSHIFT_IAM_ROLE}")
    except Exception as e:
        logger.error(f"Error: {e}")

0 commit comments

Comments
 (0)