diff --git a/tests/test_semantic/__init__.py b/tests/test_semantic/__init__.py
new file mode 100644
index 0000000..90cad00
--- /dev/null
+++ b/tests/test_semantic/__init__.py
@@ -0,0 +1 @@
+"""Tests for semantic similarity module."""
diff --git a/tests/test_semantic/test_similarity.py b/tests/test_semantic/test_similarity.py
new file mode 100644
index 0000000..ce7d762
--- /dev/null
+++ b/tests/test_semantic/test_similarity.py
@@ -0,0 +1,208 @@
+"""Tests for the semantic similarity metric."""
+
+import pytest
+
+# Skip all tests if sentence-transformers is not installed
+pytest.importorskip("sentence_transformers")
+
+from veritext.metrics.results import SemanticResult
+from veritext.semantic import SemanticSimilarity
+
+
+class TestSemanticSimilarity:  # single-pair score() checks, cache toggling, reported model name
+    @pytest.fixture
+    def semantic(self) -> SemanticSimilarity:
+        return SemanticSimilarity()  # built with default constructor arguments
+
+    def test_name(self, semantic: SemanticSimilarity) -> None:
+        assert semantic.name == "semantic"
+
+    def test_requires_reference(self, semantic: SemanticSimilarity) -> None:
+        assert semantic.requires_reference is True
+
+    def test_identical_texts(self, semantic: SemanticSimilarity) -> None:
+        text = "The cat sat on the mat"
+        result = semantic.score(text, text)
+
+        # Identical texts should have very high similarity (close to 1.0)
+        assert result.similarity >= 0.99
+        assert result.model == "all-MiniLM-L6-v2"  # model name reported when none is passed
+
+    def test_semantically_similar_texts(self, semantic: SemanticSimilarity) -> None:
+        candidate = "The cat sat on the mat"
+        reference = "A feline rested on the rug"
+        result = semantic.score(candidate, reference)
+
+        # Paraphrase sharing few words with the candidate: only a loose lower bound is asserted
+        assert result.similarity > 0.3
+
+    def test_unrelated_texts(self, semantic: SemanticSimilarity) -> None:
+        candidate = "The quick brown fox"
+        reference = "Quantum physics describes particle behaviour"
+        result = semantic.score(candidate, reference)
+
+        # Unrelated texts should have low similarity
+        assert result.similarity < 0.5
+
+    def test_empty_candidate(self, semantic: SemanticSimilarity) -> None:
+        result = semantic.score("", "The cat sat on the mat")
+        assert result.similarity == 0.0  # an empty candidate is scored 0.0, not rejected
+
+    def test_whitespace_only_candidate(self, semantic: SemanticSimilarity) -> None:
+        result = semantic.score("   \t\n  ", "The cat sat on the mat")
+        assert result.similarity == 0.0  # whitespace-only is expected to behave like empty
+
+    def test_none_reference_raises(self, semantic: SemanticSimilarity) -> None:
+        with pytest.raises(ValueError, match="requires reference"):
+            semantic.score("The cat sat", None)
+
+    def test_empty_reference_raises(self, semantic: SemanticSimilarity) -> None:
+        with pytest.raises(ValueError, match="cannot be empty"):
+            semantic.score("The cat sat", "")
+
+    def test_whitespace_reference_raises(self, semantic: SemanticSimilarity) -> None:
+        with pytest.raises(ValueError, match="cannot be empty"):
+            semantic.score("The cat sat", "   \t\n  ")
+
+    def test_multiple_references(self, semantic: SemanticSimilarity) -> None:
+        candidate = "The cat sat on the mat"
+        references = [
+            "A dog ran through the park",
+            "The cat sat on the mat",  # Exact match
+        ]
+        result = semantic.score(candidate, references)
+
+        # Should get high similarity due to exact match reference
+        assert result.similarity >= 0.99
+
+    def test_multiple_references_takes_max(self, semantic: SemanticSimilarity) -> None:
+        candidate = "The cat sat on the mat"
+        references = [
+            "Quantum physics is complex",  # Low similarity
+            "A feline rested on the rug",  # Higher similarity
+        ]
+        result = semantic.score(candidate, references)
+
+        # Best-matching reference should win; NOTE(review): this bound alone cannot prove max is used
+        assert result.similarity > 0.3
+
+    def test_result_score_property(self, semantic: SemanticSimilarity) -> None:
+        result = semantic.score("The cat sat", "The cat sat")
+        assert result.score == result.similarity
+
+    def test_caching_behaviour(self) -> None:
+        semantic = SemanticSimilarity(cache_embeddings=True)
+
+        # Score the same text pair twice on one instance with caching enabled
+        text = "The cat sat on the mat"
+        result1 = semantic.score(text, text)
+        result2 = semantic.score(text, text)
+
+        # Repeated scoring must give exactly the same similarity
+        assert result1.similarity == result2.similarity
+
+        # Clearing the cache must not change the score either
+        semantic.clear_cache()
+        result3 = semantic.score(text, text)
+        assert result3.similarity == result1.similarity
+
+    def test_caching_disabled(self) -> None:
+        semantic = SemanticSimilarity(cache_embeddings=False)
+
+        text = "The cat sat on the mat"
+        result1 = semantic.score(text, text)
+        result2 = semantic.score(text, text)
+
+        # Results should still be identical (just not cached)
+        assert result1.similarity == result2.similarity
+
+        # Clear cache should not raise even when disabled
+        semantic.clear_cache()
+
+    def test_custom_model(self) -> None:
+        # Pass the model name explicitly and check that the result echoes it back
+        semantic = SemanticSimilarity(model="all-MiniLM-L6-v2")
+        result = semantic.score("Test text", "Test text")
+        assert result.model == "all-MiniLM-L6-v2"
+
+
+class TestSemanticSimilarityBatch:  # batch_score() results, summary statistics, input validation
+    @pytest.fixture
+    def semantic(self) -> SemanticSimilarity:
+        return SemanticSimilarity()  # built with default constructor arguments
+
+    def test_batch_score_basic(self, semantic: SemanticSimilarity) -> None:
+        candidates = ["The cat sat on the mat", "A quick brown dog runs fast"]
+        references = ["The cat sat on the mat", "A quick brown dog runs fast"]
+        result = semantic.batch_score(candidates, references)
+
+        assert result.count == 2
+        assert len(result.results) == 2
+        # Identical texts should have very high similarity
+        assert all(r.similarity >= 0.99 for r in result.results)
+
+    def test_batch_score_statistics(self, semantic: SemanticSimilarity) -> None:
+        candidates = ["The cat sat", "Quantum physics is complex"]
+        references = ["The cat sat", "The cat sat"]
+        result = semantic.batch_score(candidates, references)
+
+        # Statistics are looked up by the "similarity" key
+        assert "similarity" in result.stats
+
+        # Mean should be between min and max
+        stats = result.stats["similarity"]
+        assert stats.min <= stats.mean <= stats.max
+
+    def test_batch_score_percentiles(self, semantic: SemanticSimilarity) -> None:
+        candidates = ["a", "b", "c", "d", "e"]
+        references = ["a", "b", "c", "d", "e"]  # each candidate is paired with itself
+        result = semantic.batch_score(candidates, references)
+
+        stats = result.stats["similarity"]
+        assert 25 in stats.percentiles  # only key presence is checked, not the values
+        assert 50 in stats.percentiles
+        assert 75 in stats.percentiles
+        assert 95 in stats.percentiles
+
+    def test_batch_score_none_references_raises(
+        self, semantic: SemanticSimilarity
+    ) -> None:
+        with pytest.raises(ValueError, match="requires reference"):
+            semantic.batch_score(["text"], None)
+
+    def test_batch_score_length_mismatch_raises(
+        self, semantic: SemanticSimilarity
+    ) -> None:
+        with pytest.raises(ValueError, match="must match"):
+            semantic.batch_score(["a", "b"], ["a"])  # two candidates, one reference
+
+    def test_batch_score_multi_refs(
+        self, semantic: SemanticSimilarity
+    ) -> None:
+        candidates = [
+            "The cat sat on the mat",
+            "A quick brown dog runs fast",
+        ]
+        references = [
+            ["The cat sat on the mat", "A cat rests on floor"],
+            ["A quick brown dog runs fast", "Dogs run very quickly"],
+        ]
+        result = semantic.batch_score(candidates, references)
+
+        assert result.count == 2
+        # Each candidate has an exact match among its own references
+        assert result.results[0].similarity >= 0.99
+        assert result.results[1].similarity >= 0.99
+
+
+class TestSemanticResult:  # checks on the SemanticResult model itself; no scorer is constructed
+    def test_frozen(self) -> None:
+        from pydantic import ValidationError  # local import: only this test uses pydantic directly
+
+        result = SemanticResult(similarity=0.85, model="test-model")
+        with pytest.raises(ValidationError):  # assigning to a field is expected to be rejected
+            result.similarity = 0.9  # type: ignore[misc]
+
+    def test_score_property(self) -> None:
+        result = SemanticResult(similarity=0.75, model="test-model")
+        assert result.score == 0.75  # score is expected to mirror the similarity field