- 💼 DevOps & Cloud Engineer specializing in Infrastructure Automation
- 🔧 Expertise in AWS | Kubernetes | Terraform | Docker | Ansible
- 🚀 Passionate about CI/CD Pipelines and GitOps
- 💰 FinOps - Cloud Cost Optimization Specialist
- 🔐 DevSecOps - Security Integration & Compliance
- 🤖 AIOps & MLOps - Intelligent Operations & ML Pipelines
- 📍 Based in Hyderabad, India 🇮🇳
Get in Touch
📧 Email: vishnusai.vks@gmail.com
🔗 LinkedIn: https://www.linkedin.com/in/vishnusai-14107a256/
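A quick snapshot of my focus areas and toolkit: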
```yaml
role: DevOps & Cloud Engineer | XOps Specialist
expertise:
  primary:
    - 🔧 DevOps Engineering (CI/CD, IaC, Automation)
    - 🔐 DevSecOps (Security, Compliance)
    - 💰 FinOps (Cloud Cost Management)
    - 🤖 AIOps (Intelligent Operations)
    - 🧠 MLOps (ML Pipeline Automation)
technical_stack:
  cloud: [AWS, Azure, GCP]
  containers: [Docker, Kubernetes, Helm]
  iac: [Terraform, CloudFormation, Pulumi]
  cicd: [Jenkins, GitLab CI, GitHub Actions, ArgoCD]
  config_mgmt: [Ansible]
  monitoring: [Prometheus, Grafana, ELK Stack]
  scripting: [Python, Bash]
currently_learning:
  - Advanced Kubernetes Patterns & Service Mesh (Istio/Linkerd)
  - Platform Engineering & Internal Developer Platforms
  - FinOps Best Practices & Cloud Cost Optimization
  - MLOps & AI Model Deployment at Scale
  - Site Reliability Engineering (SRE)
current_projects:
  - Building resilient cloud-native infrastructure
  - Implementing GitOps workflows with ArgoCD (see the sketch below)
  - Automating security compliance checks
  - Optimizing cloud costs across multi-cloud environments
```
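One of those projects in practice: a minimal ArgoCD `Application` sketch for a GitOps-managed deployment (the repo URL, path, and app name are placeholders, not a real project):

```yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: devops-app            # placeholder app name
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/example/devops-app-manifests  # placeholder repo
    targetRevision: main
    path: k8s/overlays/production
  destination:
    server: https://kubernetes.default.svc
    namespace: production
  syncPolicy:
    automated:
      prune: true      # delete resources removed from Git
      selfHeal: true   # revert out-of-band cluster changes to match Git
    syncOptions:
      - CreateNamespace=true
```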
```yaml
# Kubernetes Deployment with zero-downtime rolling updates and health probes
apiVersion: apps/v1
kind: Deployment
metadata:
  name: devops-app
  namespace: production
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: devops-app
  template:
    metadata:
      labels:
        app: devops-app
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
      containers:
        - name: app
          image: myregistry/devops-app:latest
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
          resources:
            requests:
              cpu: 250m
              memory: 512Mi
            limits:
              cpu: 500m
              memory: 1Gi
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 10
            periodSeconds: 5
```
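Pairing the resource requests above with autoscaling keeps costs in check; here's a `HorizontalPodAutoscaler` sketch for this Deployment (the 70% target and replica ceiling are illustrative assumptions):

```yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: devops-app
  namespace: production
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: devops-app
  minReplicas: 3             # matches the Deployment's replica count
  maxReplicas: 10            # illustrative ceiling
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70   # scale out above 70% of requested CPU
```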
```hcl
# AWS EKS Cluster with Terraform
terraform {
  required_version = ">= 1.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }

  backend "s3" {
    bucket         = "terraform-state"
    key            = "prod/eks/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}

provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      Environment = var.environment
      ManagedBy   = "Terraform"
      Project     = var.project_name
    }
  }
}

resource "aws_eks_cluster" "main" {
  name     = var.cluster_name
  version  = var.kubernetes_version
  role_arn = aws_iam_role.eks_cluster.arn

  vpc_config {
    subnet_ids              = concat(aws_subnet.private[*].id, aws_subnet.public[*].id)
    endpoint_private_access = true
    endpoint_public_access  = true
    security_group_ids      = [aws_security_group.cluster.id]
  }

  enabled_cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]

  tags = {
    Name = var.cluster_name
  }

  depends_on = [aws_iam_role_policy_attachment.eks_cluster_policy]
}

resource "aws_autoscaling_group" "nodes" {
  name                      = "${var.cluster_name}-asg"
  vpc_zone_identifier       = aws_subnet.private[*].id
  min_size                  = var.min_nodes
  max_size                  = var.max_nodes
  desired_capacity          = var.desired_nodes
  health_check_type         = "ELB"
  health_check_grace_period = 300

  launch_template {
    id      = aws_launch_template.nodes.id  # node launch template, defined elsewhere
    version = "$Latest"
  }

  tag {
    key                 = "Name"
    value               = "${var.cluster_name}-node"
    propagate_at_launch = true
  }
}
```
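With self-managed nodes like the ASG above, instances can only join the cluster once their IAM node role is mapped in the `aws-auth` ConfigMap; a minimal sketch (the account ID and role name are placeholders):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: aws-auth
  namespace: kube-system
data:
  mapRoles: |
    - rolearn: arn:aws:iam::123456789012:role/eks-node-role   # placeholder node role
      username: system:node:{{EC2PrivateDNSName}}
      groups:
        - system:bootstrappers
        - system:nodes
```

On top of that infrastructure, CI/CD runs through GitHub Actions: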
```yaml
name: Build, Test & Deploy

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main, develop]

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest pytest-cov
      - name: Run tests
        run: pytest --cov=. --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v3
        with:
          files: ./coverage.xml

  security:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run Trivy vulnerability scan
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: 'fs'
          scan-ref: '.'
          format: 'sarif'
          output: 'trivy-results.sarif'
      - name: Upload Trivy results
        uses: github/codeql-action/upload-sarif@v2
        with:
          sarif_file: 'trivy-results.sarif'

  build:
    needs: [test, security]
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Log in to Container Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Extract metadata
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
          tags: |
            type=ref,event=branch
            type=semver,pattern={{version}}
            type=sha,format=long,prefix=
      - name: Build and push
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  deploy:
    needs: build
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - name: Deploy to Production
        run: |
          # KUBECONFIG must point at a file, so write the secret to disk first
          echo "${{ secrets.KUBECONFIG }}" > kubeconfig
          export KUBECONFIG="$PWD/kubeconfig"
          kubectl set image deployment/app-deployment \
            app=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }} \
            --namespace=production
```
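Host configuration is handled with Ansible. The playbook below preps Debian-based production servers; run it with something like `ansible-playbook -i inventories/production site.yml` (inventory and playbook names are placeholders):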
```yaml
---
- name: Configure Production Servers
  hosts: production
  become: yes
  gather_facts: yes

  vars:
    docker_version: "latest"
    nodejs_version: "18.x"

  pre_tasks:
    - name: Update system packages
      apt:
        update_cache: yes
        cache_valid_time: 3600
      when: ansible_os_family == "Debian"

    - name: Install prerequisites
      apt:
        name:
          - curl
          - wget
          - git
          - vim
          - htop
        state: present

  roles:
    - geerlingguy.docker
    - geerlingguy.nodejs

  tasks:
    - name: Configure Docker daemon
      copy:
        content: |
          {
            "log-driver": "json-file",
            "log-opts": {
              "max-size": "10m",
              "max-file": "3"
            },
            "metrics-addr": "127.0.0.1:9323",
            "experimental": true,
            "insecure-registries": ["registry.internal:5000"]
          }
        dest: /etc/docker/daemon.json
      notify: restart docker

    - name: Enable and start Docker service
      systemd:
        name: docker
        enabled: yes
        state: started

    - name: Add users to docker group
      user:
        name: "{{ item }}"
        groups: docker
        append: yes
      loop:
        - devops
        - deploy

    - name: Enable firewall
      ufw:
        state: enabled

    - name: Configure firewall rules
      ufw:
        rule: allow
        port: "{{ item }}"
        proto: tcp
      loop:
        - "22"
        - "80"
        - "443"
        - "8080"

    - name: Set up log rotation
      copy:
        content: |
          /var/log/app/*.log {
            daily
            rotate 14
            compress
            delaycompress
            notifempty
            create 0640 app app
            sharedscripts
            postrotate
              systemctl reload app
            endscript
          }
        dest: /etc/logrotate.d/app

  handlers:
    - name: restart docker
      systemd:
        name: docker
        state: restarted
```
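On the FinOps side, a boto3 script that flags the usual suspects: idle EC2 instances, unattached EBS volumes, and overprovisioned RDS (the 5% and 10% average-CPU thresholds are my starting defaults; tune them to your workloads):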
"""
FinOps Cloud Cost Optimization Script
Identifies unused resources and cost optimization opportunities
"""
import boto3
import json
from datetime import datetime, timedelta
from typing import Dict, List
class CloudCostOptimizer:
def __init__(self, region='us-east-1'):
self.ec2 = boto3.client('ec2', region_name=region)
self.cloudwatch = boto3.client('cloudwatch', region_name=region)
self.rds = boto3.client('rds', region_name=region)
def find_idle_ec2_instances(self, days=7) -> List[Dict]:
"""Find EC2 instances with low CPU utilization"""
idle_instances = []
instances = self.ec2.describe_instances(
Filters=[{'Name': 'instance-state-name', 'Values': ['running']}]
)
end_time = datetime.utcnow()
start_time = end_time - timedelta(days=days)
for reservation in instances['Reservations']:
for instance in reservation['Instances']:
instance_id = instance['InstanceId']
response = self.cloudwatch.get_metric_statistics(
Namespace='AWS/EC2',
MetricName='CPUUtilization',
Dimensions=[{'Name': 'InstanceId', 'Value': instance_id}],
StartTime=start_time,
EndTime=end_time,
Period=3600,
Statistics=['Average']
)
if response['Datapoints']:
avg_cpu = sum(dp['Average'] for dp in response['Datapoints']) / len(response['Datapoints'])
if avg_cpu < 5:
idle_instances.append({
'InstanceId': instance_id,
'Type': instance['InstanceType'],
'AvgCPU': round(avg_cpu, 2),
'LaunchTime': instance['LaunchTime'].isoformat()
})
return idle_instances
def find_unattached_ebs_volumes(self) -> List[Dict]:
"""Find unattached EBS volumes"""
unattached = []
volumes = self.ec2.describe_volumes(
Filters=[{'Name': 'status', 'Values': ['available']}]
)
for volume in volumes['Volumes']:
unattached.append({
'VolumeId': volume['VolumeId'],
'Size': volume['Size'],
'Type': volume['VolumeType'],
'CreateTime': volume['CreateTime'].isoformat()
})
return unattached
def find_overprovisioned_rds(self) -> List[Dict]:
"""Find RDS instances with low utilization"""
overprovisioned = []
databases = self.rds.describe_db_instances()
for db in databases['DBInstances']:
db_id = db['DBInstanceIdentifier']
end_time = datetime.utcnow()
start_time = end_time - timedelta(days=7)
response = self.cloudwatch.get_metric_statistics(
Namespace='AWS/RDS',
MetricName='CPUUtilization',
Dimensions=[{'Name': 'DBInstanceIdentifier', 'Value': db_id}],
StartTime=start_time,
EndTime=end_time,
Period=3600,
Statistics=['Average']
)
if response['Datapoints']:
avg_cpu = sum(dp['Average'] for dp in response['Datapoints']) / len(response['Datapoints'])
if avg_cpu < 10:
overprovisioned.append({
'DBInstanceIdentifier': db_id,
'InstanceClass': db['DBInstanceClass'],
'AvgCPU': round(avg_cpu, 2),
'Engine': db['Engine']
})
return overprovisioned
def generate_report(self) -> Dict:
"""Generate comprehensive cost optimization report"""
report = {
'timestamp': datetime.utcnow().isoformat(),
'idle_ec2_instances': self.find_idle_ec2_instances(),
'unattached_ebs_volumes': self.find_unattached_ebs_volumes(),
'overprovisioned_rds': self.find_overprovisioned_rds()
}
return report
def save_report(self, filename='cost_report.json'):
"""Save report to file"""
report = self.generate_report()
with open(filename, 'w') as f:
json.dump(report, f, indent=2)
print(f"Report saved to {filename}")
return report
if __name__ == '__main__':
optimizer = CloudCostOptimizer(region='us-east-1')
report = optimizer.save_report()
print(f"\n📊 Cost Optimization Report")
print(f"Idle EC2 Instances: {len(report['idle_ec2_instances'])}")
print(f"Unattached EBS Volumes: {len(report['unattached_ebs_volumes'])}")
print(f"Overprovisioned RDS: {len(report['overprovisioned_rds'])}")#!/bin/bash
```bash
#!/bin/bash
# Comprehensive infrastructure monitoring and alerting script
set -euo pipefail

# Configuration
SLACK_WEBHOOK="${SLACK_WEBHOOK_URL}"
LOG_FILE="/var/log/infrastructure-monitor.log"

# Alert thresholds (bash associative array)
declare -A THRESHOLDS=(
  [cpu_usage]=80
  [memory_usage]=85
  [disk_usage]=90
  [load_average]=4
)

# Colors for output
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m' # No Color

log_message() {
  echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" >> "${LOG_FILE}"
}

check_cpu_usage() {
  local cpu_usage
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print int($2)}')
  if [ "${cpu_usage}" -gt "${THRESHOLDS[cpu_usage]}" ]; then
    echo -e "${RED}⚠️ HIGH CPU USAGE: ${cpu_usage}%${NC}"
    log_message "ALERT: CPU usage at ${cpu_usage}%"
    send_slack_alert "🔴 HIGH CPU USAGE: ${cpu_usage}%"
    return 1
  else
    echo -e "${GREEN}✓ CPU Usage: ${cpu_usage}%${NC}"
  fi
  return 0
}

check_memory_usage() {
  local mem_usage
  mem_usage=$(free | grep Mem | awk '{printf("%.0f", ($3/$2) * 100)}')
  if [ "${mem_usage}" -gt "${THRESHOLDS[memory_usage]}" ]; then
    echo -e "${RED}⚠️ HIGH MEMORY USAGE: ${mem_usage}%${NC}"
    log_message "ALERT: Memory usage at ${mem_usage}%"
    send_slack_alert "🔴 HIGH MEMORY USAGE: ${mem_usage}%"
    return 1
  else
    echo -e "${GREEN}✓ Memory Usage: ${mem_usage}%${NC}"
  fi
  return 0
}

check_disk_usage() {
  local disk_usage mount_point
  while IFS= read -r line; do
    disk_usage=$(echo "${line}" | awk '{print $5}' | sed 's/%//')
    mount_point=$(echo "${line}" | awk '{print $6}')
    if [ "${disk_usage}" -gt "${THRESHOLDS[disk_usage]}" ]; then
      echo -e "${RED}⚠️ HIGH DISK USAGE on ${mount_point}: ${disk_usage}%${NC}"
      log_message "ALERT: Disk usage at ${disk_usage}% on ${mount_point}"
      send_slack_alert "🔴 HIGH DISK USAGE on ${mount_point}: ${disk_usage}%"
      return 1
    fi
  done < <(df -h | grep -vE '^Filesystem|tmpfs|cdrom')
  echo -e "${GREEN}✓ Disk Usage: Normal${NC}"
  return 0
}

check_service_health() {
  local services=("docker" "kubelet" "nginx")
  for service in "${services[@]}"; do
    if systemctl is-active --quiet "${service}"; then
      echo -e "${GREEN}✓ ${service}: Running${NC}"
    else
      echo -e "${RED}⚠️ ${service}: STOPPED${NC}"
      log_message "ALERT: Service ${service} is not running"
      send_slack_alert "🔴 Service DOWN: ${service}"
      return 1
    fi
  done
  return 0
}

send_slack_alert() {
  local message="$1"
  curl -X POST -H 'Content-type: application/json' \
    --data "{\"text\":\"${message}\"}" \
    "${SLACK_WEBHOOK}"
}

generate_report() {
  echo "========================================"
  echo "Infrastructure Health Report"
  echo "Generated: $(date)"
  echo "========================================"
  # '|| true' keeps one failing check from aborting the report under 'set -e'
  check_cpu_usage || true
  check_memory_usage || true
  check_disk_usage || true
  check_service_health || true
  echo "========================================"
}

# Main execution
main() {
  generate_report
  log_message "Health check completed"
}

main "$@"
```
🎯 Learning Philosophy

In the world of DevOps & Cloud Engineering, the only constant is change.
Every challenge is an opportunity to learn. Every failure teaches a lesson.

"The expert in anything was once a beginner." — Helen Hayes

| 📚 Continuous Learning | 🔬 Experimentation | 🤝 Knowledge Sharing |
| --- | --- | --- |
| Stay curious & update skills | Build, test & iterate | Mentor & contribute to community |
