"""
Sentence-level sentiment scoring for clinical text
====================================================
Provides a VADER-based ``SentimentScorer`` that avoids the full-text
saturation problem specific to clinical discharge notes, and a utility for
Z-score normalising a column of raw scores across a sample dataset.

Background
----------
Standard full-text VADER compound scoring saturates at −1.0 for >94 % of
clinical discharge summaries because clinical language is lexically negative
("pain", "failure", "respiratory distress"). After Z-scoring a saturated
distribution every patient receives the same score, making the metric useless
for discrimination.

Sentence-level averaging (score each sentence independently, take the mean)
avoids saturation and closely approximates the word-averaged approach used by
the ``pattern.en`` library in the original Boag et al. 2018 implementation.

Reference
---------
Boag et al. "Racial Disparities and Mistrust in End-of-Life Care."
MLHC 2018. https://arxiv.org/abs/1808.03827
"""

from typing import Dict, List, Optional

import numpy as np


class SentimentScorer:
    """Sentence-level VADER sentiment scorer for clinical text.

    Scores a document by:
    1. Tokenising into sentences with NLTK's ``sent_tokenize``.
    2. Computing the VADER compound score for each sentence.
    3. Returning the mean compound score across all sentences.

    This avoids the full-text VADER saturation problem that affects clinical
    discharge notes (>94 % saturate at −1.0).

    The scorer is intentionally stateless and thread-safe after initialisation.

    Args:
        language: Reserved for future multilingual support. Only ``"english"``
            is currently supported (NLTK sentence tokeniser).

    Examples:
        >>> scorer = SentimentScorer()
        >>> scorer.score("The patient is calm and alert. No acute distress.")
        0.412
        >>> scorer.score("Patient unresponsive, severe respiratory failure.")
        -0.613
    """

    def __init__(self, language: str = "english") -> None:
        # nltk is imported lazily here so that merely importing this module
        # does not require the optional dependency to be installed.
        try:
            from nltk.sentiment.vader import SentimentIntensityAnalyzer
            from nltk.tokenize import sent_tokenize
        except ImportError as e:
            raise ImportError(
                "nltk is required for SentimentScorer. "
                "Install with: pip install nltk && python -c "
                "\"import nltk; nltk.download('vader_lexicon'); "
                "nltk.download('punkt_tab')\""
            ) from e

        self._sid = SentimentIntensityAnalyzer()
        self._sent_tokenize = sent_tokenize
        self.language = language

    def score(self, text: str) -> float:
        """Compute the mean sentence-level VADER compound score for a document.

        Args:
            text: Raw document text. Empty or whitespace-only text returns 0.0.

        Returns:
            Mean VADER compound score in [−1.0, +1.0].
            Higher values indicate more positive sentiment.
        """
        if not text or not text.strip():
            return 0.0
        sentences = self._sent_tokenize(text)
        if not sentences:
            return 0.0
        scores = [
            self._sid.polarity_scores(s)["compound"]
            for s in sentences
        ]
        return float(np.mean(scores))

    def score_batch(self, texts: List[str]) -> List[float]:
        """Score a list of documents.

        Args:
            texts: List of raw document strings.

        Returns:
            List of mean sentence-level compound scores.
        """
        return [self.score(t) for t in texts]

    def negate_and_zscore(self, raw_scores: Dict) -> Dict:
        """Negate and Z-score a dict of {key: raw_score} values.

        Applies the normalisation from Boag et al. 2018:

            neg_score[key] = -(raw_score[key] - μ) / σ

        Higher output value → more negative sentiment → more mistrust signal.

        Args:
            raw_scores: Dict mapping any key to a raw compound score.

        Returns:
            Dict with the same keys mapped to negated Z-scores. If the raw
            scores have zero variance, every key maps to 0.0.
        """
        vals = np.array(list(raw_scores.values()), dtype=np.float64)
        mu, sigma = vals.mean(), vals.std()
        if sigma == 0.0:
            # Zero variance: every value equals the mean, so every Z-score is 0.
            return {k: 0.0 for k in raw_scores}
        return {k: float(-(v - mu) / sigma) for k, v in raw_scores.items()}


def normalize_sentiment_scores(sample_dataset, feature_key: str = "neg_sentiment") -> None:
    """Z-score normalise a raw sentiment feature column in-place across all samples.

    The ``MistrustSentimentMIMIC3`` task stores the *raw* negated sentiment
    score (``-mean_sentence_compound``) per sample. Because Z-scoring requires
    the global mean and standard deviation across all patients, it cannot be
    done inside ``__call__`` (which processes one patient at a time). Call
    this utility **after** ``dataset.set_task()`` to complete the normalisation.

    The composite transformation (task negation + this centering/scaling) is
    identical to Boag et al. 2018:

        neg_score = -(raw_score - μ_all) / σ_all

    Note that the negation is already baked into the stored raw values, so
    this function applies only ``(raw - μ) / σ``.

    If the raw scores have zero variance, every sample's feature is set to
    ``[0.0]`` (each value equals the mean, so each Z-score is 0) — the column
    always ends up on the Z-score scale.

    Args:
        sample_dataset: A PyHealth ``SampleDataset`` produced by
            ``base_dataset.set_task(MistrustSentimentMIMIC3(...))``.
        feature_key: Name of the sentiment feature in each sample.
            Defaults to ``"neg_sentiment"``.

    Example:
        >>> sample_dataset = base_dataset.set_task(MistrustSentimentMIMIC3(...))
        >>> normalize_sentiment_scores(sample_dataset)
        >>> # neg_sentiment values are now Z-scored across the full dataset
    """
    raw_vals = np.array(
        [s[feature_key][0] for s in sample_dataset.samples],
        dtype=np.float64,
    )
    mu, sigma = raw_vals.mean(), raw_vals.std()
    if sigma == 0.0:
        # Degenerate distribution: write the zeros out instead of leaving the
        # raw values in place, so callers always get Z-scored values
        # (consistent with SentimentScorer.negate_and_zscore).
        for sample in sample_dataset.samples:
            sample[feature_key] = [0.0]
        return
    for sample in sample_dataset.samples:
        raw = sample[feature_key][0]
        sample[feature_key] = [float((raw - mu) / sigma)]
Negate: ``raw_neg_score = -mean_sentence_polarity`` + (higher → more negative → more mistrust signal). + +Z-score normalisation (``neg_score = -(raw - μ) / σ``) requires global +statistics and must be applied **after** ``set_task()`` using the provided +``normalize_sentiment_scores`` utility from ``pyhealth.nlp``. + +Output feature +-------------- +``neg_sentiment`` — a single-element list ``[float]`` stored as a ``"tensor"`` +feature. The one-element list satisfies PyHealth's TensorProcessor which +expects an iterable of numerics. + +Usage +----- + >>> from pyhealth.datasets import MIMIC3Dataset + >>> from pyhealth.tasks import MistrustSentimentMIMIC3 + >>> from pyhealth.nlp import normalize_sentiment_scores + >>> from pyhealth.models import LogisticRegression + >>> + >>> base_dataset = MIMIC3Dataset( + ... root="/path/to/mimic-iii/1.4", + ... tables=["NOTEEVENTS"], + ... ) + >>> task = MistrustSentimentMIMIC3() + >>> sample_dataset = base_dataset.set_task(task) + >>> normalize_sentiment_scores(sample_dataset) # Z-score in-place + >>> model = LogisticRegression(dataset=sample_dataset) +""" + +from typing import Any, Dict, List, Optional + +from pyhealth.tasks.base_task import BaseTask + + +class MistrustSentimentMIMIC3(BaseTask): + """Compute negative-sentiment mistrust proxy from MIMIC-III discharge notes. + + For each hospital admission the task produces one sample: + + - ``neg_sentiment``: a one-element list ``[float]`` — the raw negated + mean sentence-level VADER compound score across all discharge summaries + for this admission (schema: ``"tensor"``). Values are negated so that + higher = more negative sentiment = more mistrust signal. Call + ``pyhealth.nlp.normalize_sentiment_scores(sample_dataset)`` after + ``set_task()`` to complete the Z-score normalisation step. + + - ``noncompliance``: ``1`` if any note for this admission contains + ``"noncompliant"``, else ``0`` (schema: ``"binary"``). 
This matches + the output label of ``MistrustNoncomplianceMIMIC3``, enabling direct + comparison of the three mistrust proxies on the same task. + + Args: + min_notes: Minimum number of discharge summary notes required for a + sample to be included. Defaults to 1. + output_label: Column name and key of the binary output label. + Change to ``"autopsy_consent"`` to align with + ``MistrustAutopsyMIMIC3``. Defaults to ``"noncompliance"``. + + Examples: + >>> from pyhealth.datasets import MIMIC3Dataset + >>> from pyhealth.tasks import MistrustSentimentMIMIC3 + >>> from pyhealth.nlp import normalize_sentiment_scores + >>> + >>> base_dataset = MIMIC3Dataset( + ... root="/path/to/mimic-iii/1.4", + ... tables=["NOTEEVENTS"], + ... ) + >>> task = MistrustSentimentMIMIC3() + >>> sample_dataset = base_dataset.set_task(task) + >>> normalize_sentiment_scores(sample_dataset) + >>> len(sample_dataset) + 52726 + """ + + task_name: str = "MistrustSentimentMIMIC3" + input_schema: Dict[str, str] = {"neg_sentiment": "tensor"} + output_schema: Dict[str, str] = {"noncompliance": "binary"} + + def __init__( + self, + min_notes: int = 1, + output_label: str = "noncompliance", + ) -> None: + self.min_notes = min_notes + self.output_label = output_label + # output_schema is a class attribute; update it to reflect output_label + self.output_schema = {output_label: "binary"} + + # Lazy-initialise scorer to avoid importing nltk at module load time + self._scorer: Optional[Any] = None + + def _get_scorer(self): + """Lazily initialise SentimentScorer on first use.""" + if self._scorer is None: + from pyhealth.nlp import SentimentScorer + self._scorer = SentimentScorer() + return self._scorer + + @staticmethod + def _noncompliance_label(noteevents: List[Any]) -> int: + """Return 1 if any note contains 'noncompliant', else 0.""" + for ev in noteevents: + if "noncompliant" in str(getattr(ev, "text", "") or "").lower(): + return 1 + return 0 + + @staticmethod + def _autopsy_label(noteevents: 
List[Any]) -> Optional[int]: + """Return autopsy consent label (1/0) or None if absent/ambiguous.""" + consented = declined = False + for ev in noteevents: + text = str(getattr(ev, "text", "") or "").lower() + if "autopsy" not in text: + continue + for line in text.split("\n"): + if "autopsy" not in line: + continue + if any(w in line for w in ("decline", "not consent", "refuse", "denied")): + declined = True + if any(w in line for w in ("consent", "agree", "request")): + consented = True + if consented and declined: + return None + if consented: + return 1 + if declined: + return 0 + return None + + def __call__(self, patient: Any) -> List[Dict[str, Any]]: + """Process a single patient into negative-sentiment classification samples. + + Args: + patient: a PyHealth Patient object with ``noteevents`` loaded. + + Returns: + List of dicts, one per admission that has ≥ ``min_notes`` discharge + summaries, each containing: + - ``patient_id`` + - ``visit_id`` (hadm_id) + - ``neg_sentiment`` (list of one float — raw negated score) + - output label (``noncompliance`` or ``autopsy_consent``) + """ + scorer = self._get_scorer() + samples = [] + admissions = patient.get_events(event_type="admissions") + + for admission in admissions: + hadm_id = admission.hadm_id + + noteevents = patient.get_events( + event_type="noteevents", + filters=[("hadm_id", "==", hadm_id)], + ) + + # Extract discharge summaries only + discharge_notes = [ + ev for ev in noteevents + if str(getattr(ev, "category", "") or "").strip().lower() + == "discharge summary" + ] + + if len(discharge_notes) < self.min_notes: + continue + + # Score each note; average across notes for this admission + note_scores = [ + scorer.score(str(getattr(ev, "text", "") or "")) + for ev in discharge_notes + ] + raw_mean = float(sum(note_scores) / len(note_scores)) + + # Negate: higher value = more negative sentiment = more mistrust + raw_neg = -raw_mean + + # Derive output label + if self.output_label == "noncompliance": + 
label = self._noncompliance_label(noteevents) + elif self.output_label == "autopsy_consent": + label = self._autopsy_label(noteevents) + if label is None: + continue # exclude ambiguous/absent autopsy signal + else: + raise ValueError( + f"output_label must be 'noncompliance' or 'autopsy_consent', " + f"got '{self.output_label}'" + ) + + samples.append( + { + "patient_id": patient.patient_id, + "visit_id": hadm_id, + "neg_sentiment": [raw_neg], # 1-element list for TensorProcessor + self.output_label: label, + } + ) + + return samples