# Source: Multi-Modal-Convergence-Detection/MultiModalRLMAnalyzer.py at main · Soum-Code/Multi-Modal-Convergence-Detection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
# MultiModalRLMAnalyzer.py - Core analysis framework
import torch
import numpy as np
from typing import List, Dict, Tuple
from sentence_transformers import SentenceTransformer
from PIL import Image
import requests
from io import BytesIO


class MultiModalFailureAnalyzer:
    """Analyze failure modes specific to multi-modal RLMs.

    Both analyses operate on plain strings: the model's text outputs and
    textual image descriptions/captions. Semantic comparisons are made on
    the embeddings returned by ``self.text_model.encode``.
    """

    def __init__(self):
        # Load text embedding model for semantic analysis
        self.text_model = SentenceTransformer('all-MiniLM-L6-v2')

    @staticmethod
    def _cosine_similarity(vec_a, vec_b) -> float:
        """Return the cosine similarity of two vectors (0.0 if either has zero norm)."""
        denom = float(np.linalg.norm(vec_a)) * float(np.linalg.norm(vec_b))
        if denom == 0.0:
            # A zero vector has no direction; report "no similarity" instead
            # of dividing by zero and propagating NaN into the metrics.
            return 0.0
        return float(np.dot(vec_a, vec_b)) / denom

    def analyze_cross_modal_drift(self, text_outputs: List[str],
                                  image_descriptions: List[str]) -> Dict:
        """Detect drift between text and image modalities.

        Consecutive entries of each sequence are compared by the cosine
        similarity of their embeddings.

        Args:
            text_outputs: Text outputs, one per step, in order.
            image_descriptions: Image descriptions, one per step, in order.
                May be shorter than ``text_outputs``; cross-modal metrics are
                only produced for steps where both sequences have an entry.

        Returns:
            A dict of lists of floats:

            * ``'temporal_consistency'``: similarity between each text output
              and the previous one (``len(text_outputs) - 1`` entries).
            * ``'semantic_alignment'`` and ``'modality_gap'``: both hold
              ``abs(text_sim - img_sim)`` for each step that has an image
              description pair; the two lists carry identical values.

            All lists are empty when fewer than two text outputs are given.
        """
        drift_metrics = {
            'semantic_alignment': [],
            'temporal_consistency': [],
            'modality_gap': []
        }

        n_steps = len(text_outputs)
        if n_steps < 2:
            # No consecutive pair to compare.
            return drift_metrics

        # Encode every string once up front rather than re-encoding each
        # entry as both the "current" and the "previous" element of a pair.
        text_embeds = self.text_model.encode(list(text_outputs))

        # Only the steps covered by both sequences can be compared.
        paired_steps = min(n_steps, len(image_descriptions))
        img_embeds = None
        if paired_steps >= 2:
            img_embeds = self.text_model.encode(
                list(image_descriptions[:paired_steps])
            )

        for i in range(1, n_steps):
            # Text similarity between consecutive outputs
            text_sim = self._cosine_similarity(text_embeds[i - 1], text_embeds[i])
            drift_metrics['temporal_consistency'].append(text_sim)

            # Image description similarity (i >= 1, so i < paired_steps
            # implies paired_steps >= 2 and img_embeds is populated)
            if i < paired_steps:
                img_sim = self._cosine_similarity(img_embeds[i - 1], img_embeds[i])

                # Cross-modal alignment: how differently the two modalities
                # changed between step i-1 and step i.
                gap = abs(text_sim - img_sim)
                drift_metrics['semantic_alignment'].append(gap)
                drift_metrics['modality_gap'].append(gap)

        return drift_metrics

    def identify_failure_patterns(self, multimodal_outputs: List[Dict]) -> Dict:
        """Identify specific multi-modal failure patterns.

        Only entries containing both ``'image_caption'`` and
        ``'text_response'`` are inspected; others are skipped.

        Args:
            multimodal_outputs: Dicts with string values under the keys
                ``'text_response'`` and ``'image_caption'``.

        Returns:
            A dict mapping pattern name to occurrence count.
            ``'cross_modal_inconsistency'`` counts entries whose
            lower-cased, whitespace-split word sets have a Jaccard overlap
            below 0.3. ``'modality_dominance'`` counts entries whose text
            response is more than five times as long (in characters) as the
            caption. ``'visual_reasoning_breakdown'`` and
            ``'text_image_misalignment'`` are reported but have no detection
            logic here, so they are always 0.
        """
        failure_patterns = {
            'cross_modal_inconsistency': 0,
            'visual_reasoning_breakdown': 0,
            'text_image_misalignment': 0,
            'modality_dominance': 0
        }

        for output in multimodal_outputs:
            # Check for visual-text inconsistencies
            if 'image_caption' not in output or 'text_response' not in output:
                continue

            text_response = output['text_response']
            image_caption = output['image_caption']

            # Simple keyword mismatch detection
            text_keywords = set(text_response.lower().split())
            image_keywords = set(image_caption.lower().split())

            # max(1, ...) keeps the ratio defined when both strings are empty.
            shared = len(text_keywords & image_keywords)
            total = max(1, len(text_keywords | image_keywords))
            if shared / total < 0.3:
                failure_patterns['cross_modal_inconsistency'] += 1

            # Check if one modality dominates
            if len(text_response) > 5 * len(image_caption):
                failure_patterns['modality_dominance'] += 1

        return failure_patterns


class BasicMultiModalRLM:
    """Basic multi-modal recursive language model (requires BLIP-2 download).

    Wraps a BLIP-2 processor/model pair and exposes a single call that
    returns both a prompted answer and an unprompted caption for an image.
    """

    def __init__(self, model_name="Salesforce/blip2-flan-t5-xl"):
        # Imported lazily so the rest of the module is usable without
        # transformers installed.
        from transformers import Blip2Processor, Blip2ForConditionalGeneration

        self.processor = Blip2Processor.from_pretrained(model_name)
        self.model = Blip2ForConditionalGeneration.from_pretrained(model_name)
        # Prefer the GPU when one is visible to torch.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def _generate_and_decode(self, encoded_inputs, token_limit: int) -> str:
        """Run generation without gradients and return the first decoded string."""
        with torch.no_grad():
            output_ids = self.model.generate(**encoded_inputs, max_length=token_limit)
        decoded = self.processor.batch_decode(output_ids, skip_special_tokens=True)
        return decoded[0]

    def generate_response(self, image, prompt: str, max_length: int = 100) -> Dict:
        """Generate both text and image understanding.

        Args:
            image: Image passed to the processor as ``images``.
            prompt: Text prompt paired with the image for the first pass.
            max_length: ``max_length`` forwarded to generation for the
                prompted pass (the caption pass always uses 50).

        Returns:
            Dict with ``'text_response'`` (prompted output),
            ``'image_caption'`` (image-only output) and
            ``'combined_output'`` (both joined into one string).
        """
        # Pass 1: image + prompt
        prompted_inputs = self.processor(
            images=image, text=prompt, return_tensors="pt"
        ).to(self.device)
        answer = self._generate_and_decode(prompted_inputs, max_length)

        # Pass 2: image only, to get a caption
        caption_inputs = self.processor(
            images=image, return_tensors="pt"
        ).to(self.device)
        caption = self._generate_and_decode(caption_inputs, 50)

        result = {
            'text_response': answer,
            'image_caption': caption,
        }
        result['combined_output'] = f"Text: {answer} | Image: {caption}"
        return result


class MultiModalBenchmark:
    """Benchmark system for multi-modal RLM evaluation.

    Holds a fixed list of test cases in ``self.test_cases``; each case is a
    dict with a ``'name'``, a ``'task'`` description and a list of
    ``'prompts'``.
    """

    def __init__(self):
        self.test_cases = [
            {
                'name': 'Visual Question Answering',
                'task': 'Answer questions about images',
                'prompts': [
                    "What objects are in this image?",
                    "Describe the scene in detail.",
                    "What is the main subject?"
                ]
            },
            {
                'name': 'Image Caption Enhancement',
                'task': 'Improve image captions iteratively',
                'prompts': [
                    "Make this caption more descriptive:",
                    "Add more details to this description:",
                    "Enhance the storytelling aspect:"
                ]
            },
            {
                'name': 'Visual Reasoning Chain',
                'task': 'Solve problems using both text and images',
                'prompts': [
                    "Based on this diagram, explain the process:",
                    "Analyze this chart and describe trends:",
                    "Interpret this infographic step by step:"
                ]
            }
        ]

    def load_test_image(self, url: str, timeout: float = 30.0) -> Image.Image:
        """Load test image from URL.

        Args:
            url: Address of the image to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` may block indefinitely.

        Returns:
            The image opened from the downloaded bytes.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On connection problems or timeout.
        """
        response = requests.get(url, timeout=timeout)
        # Fail on 4xx/5xx here instead of handing an error page to PIL,
        # which would surface as a confusing image-decoding error.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))


# ── Phase 1 driver ──────────────────────────────────────────────────────────
def phase1_initial_analysis():
    """Phase 1: Initial multi-modal analysis.

    Runs the failure analyzer on small built-in sample data, prints the
    averaged drift metrics and the failure-pattern counts, and returns both
    results.

    Returns:
        Tuple ``(drift_analysis, failure_patterns)`` as produced by
        ``MultiModalFailureAnalyzer.analyze_cross_modal_drift`` and
        ``MultiModalFailureAnalyzer.identify_failure_patterns``.
    """
    print("=" * 60)
    print("PHASE 1: MULTI-MODAL CONVERGENCE ANALYSIS")
    print("=" * 60)

    # Initialize analyzer
    analyzer = MultiModalFailureAnalyzer()

    # Test with sample data: three paraphrases of the same scene and the
    # matching keyword-style image descriptions.
    sample_text_outputs = [
        "The image shows a red apple on a wooden table.",
        "The image depicts a red apple sitting on a brown wooden surface.",
        "A crimson apple rests upon a rustic wooden tabletop in the image."
    ]

    sample_image_descriptions = [
        "red apple wooden table",
        "apple on table brown wood",
        "crimson fruit wooden surface"
    ]

    # Analyze cross-modal drift
    drift_analysis = analyzer.analyze_cross_modal_drift(
        sample_text_outputs, sample_image_descriptions
    )

    print("\nCross-Modal Drift Analysis:")
    print(f"  Average Semantic Alignment: {np.mean(drift_analysis['semantic_alignment']):.3f}")
    print(f"  Average Modality Gap: {np.mean(drift_analysis['modality_gap']):.3f}")

    # Identify failure patterns
    sample_multimodal_outputs = [
        {
            'text_response': "The image shows a beautiful sunset over mountains.",
            'image_caption': "mountains landscape nature"
        },
        {
            'text_response': "This depicts a serene lake reflecting clouds in the sky.",
            'image_caption': "water body peaceful scenery"
        }
    ]

    failure_patterns = analyzer.identify_failure_patterns(sample_multimodal_outputs)
    print("\nFailure Pattern Analysis:")
    for pattern, count in failure_patterns.items():
        print(f"  {pattern}: {count}")

    return drift_analysis, failure_patterns


# Run the Phase 1 demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    phase1_initial_analysis()