Merged
22 changes: 22 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,22 @@
name: Test

on:
  push:
    branches: [main, master]
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        go-version: ["1.24", "stable"]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
      - run: go test -v ./...
        working-directory: scanner
      - run: go test -fuzz=FuzzScanner -fuzztime=30s ./...
        working-directory: scanner
23 changes: 2 additions & 21 deletions .gitignore
@@ -1,22 +1,3 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.out
4 changes: 0 additions & 4 deletions .travis.yml

This file was deleted.

9 changes: 5 additions & 4 deletions CONTRIBUTORS
@@ -1,6 +1,7 @@
The following contributors hold copyright rights to this package, licensed
in accordance with the license:
The following contributors hold copyright over portions of this package,
licensed in accordance with the LICENSE file:

Copyright 2012 The Gorilla Authors
Copyright 2016 Barracuda Networks
Copyright 2012 The Gorilla Authors (original CSS scanner)
Copyright 2015-2016 Barracuda Networks (thejerf/css fork: semantic token values, re-emission)
Copyright 2016 Robert Lillack (https://github.com/roblillack)
Copyright 2020-2026 Patrick Gundlach (https://github.com/speedata)
103 changes: 44 additions & 59 deletions README.md
@@ -1,74 +1,59 @@
css
===
# css/scanner

Forked from https://github.com/thejerf/css and added support for `local` keyword.
A fast CSS3 tokenizer for Go.

[![Build Status](https://travis-ci.org/speedata/css.svg?branch=master)](https://travis-ci.org/speedata/css)
This package tokenizes CSS input into a stream of typed tokens (identifiers, strings, numbers, dimensions, URLs, comments, etc.) following the CSS Syntax specification. It is intended to be used by a lexer or parser.

## Origin

A CSS3 tokenizer.
Originally based on the [Gorilla CSS scanner](http://www.gorillatoolkit.org/pkg/css/scanner), significantly reworked by [thejerf/css](https://github.com/thejerf/css) (Barracuda Networks), then forked by [speedata](https://github.com/speedata) with further changes:

This is gratefully forked from the [Gorilla CSS
scanner](http://www.gorillatoolkit.org/pkg/css/scanner), and had
significant and __BACKWARDS-INCOMPATIBLE__ changes applied to it.
- CSS Syntax Level 3 support: custom properties (`--my-var`), signed numbers (`-42px`, `+3em`)
- Hand-written scanner replacing all regex-based tokenization (~10x faster)
- Support for `local()`, `format()`, and `tech()` function tokens
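
To illustrate what "hand-written scanner" means here (this is a standalone sketch of the technique, not the package's actual code), a byte-driven loop dispatches on the current character class instead of trying regexes in sequence — which is where the speedup over regex-based tokenization comes from:

```go
package main

import (
	"fmt"
	"unicode"
)

// tokenKind is a tiny illustrative subset of the real scanner's token types.
type tokenKind int

const (
	kindIdent tokenKind = iota
	kindNumber
	kindSpace
	kindDelim
)

type token struct {
	kind  tokenKind
	value string
}

// scan is a minimal hand-written tokenizer: one pass, dispatching on the
// current rune, with each case consuming as many characters as belong to
// the token. (Deliberately simplified: no escapes, strings, or signed
// numbers.)
func scan(input string) []token {
	var tokens []token
	runes := []rune(input)
	for i := 0; i < len(runes); {
		start := i
		switch r := runes[i]; {
		case unicode.IsSpace(r):
			for i < len(runes) && unicode.IsSpace(runes[i]) {
				i++
			}
			tokens = append(tokens, token{kindSpace, string(runes[start:i])})
		case unicode.IsDigit(r):
			for i < len(runes) && (unicode.IsDigit(runes[i]) || runes[i] == '.') {
				i++
			}
			tokens = append(tokens, token{kindNumber, string(runes[start:i])})
		case unicode.IsLetter(r) || r == '-' || r == '_':
			for i < len(runes) && (unicode.IsLetter(runes[i]) ||
				unicode.IsDigit(runes[i]) || runes[i] == '-' || runes[i] == '_') {
				i++
			}
			tokens = append(tokens, token{kindIdent, string(runes[start:i])})
		default:
			// Everything else is a single-character delimiter.
			i++
			tokens = append(tokens, token{kindDelim, string(r)})
		}
	}
	return tokens
}

func main() {
	for _, t := range scan("margin: 42px") {
		fmt.Printf("%d %q\n", t.kind, t.value)
	}
}
```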

Status
======
## Usage

Jerf-standard 100% coverage, [full
godoc](https://godoc.org/github.com/thejerf/css/scanner) and is clean by
the standards of many linters. Run through
[go-fuzz](https://github.com/dvyukov/go-fuzz). I have shipped
production-quality software on it, though as I write this it's not too
heavy a workout yet.
```go
import "github.com/speedata/css/scanner"

Semantic versioning is being used, so this may also be imported via
`gopkg.in/thejerf/css.v1/scanner`.
s := scanner.New(input)
for {
	token := s.Next()
	if token.Type == scanner.EOF || token.Type == scanner.Error {
		break
	}
	// token.Type, token.Value, token.Line, token.Column
}
```

Accepting PRs if you have them.
## Token types

Starting with the commit after dad94e3e4d, I will be signing this repo
with the [jerf keybase.io key](https://keybase.io/jerf).
| Token | Example input | `.Value` |
|-------|--------------|----------|
| `Ident` | `color`, `-webkit-foo`, `--my-var` | `color`, `-webkit-foo`, `--my-var` |
| `Function` | `rgb(` | `rgb` |
| `AtKeyword` | `@media` | `media` |
| `Hash` | `#fff` | `fff` |
| `String` | `"hello"` | `hello` |
| `Number` | `42`, `-3.14`, `+0.5` | `42`, `-3.14`, `+0.5` |
| `Percentage` | `50%` | `50` |
| `Dimension` | `12px`, `-1.5em` | `12px`, `-1.5em` |
| `URI` | `url('bg.png')` | `bg.png` |
| `Local` | `local('Font')` | `Font` |
| `Format` | `format('woff2')` | `woff2` |
| `Tech` | `tech('color-SVG')` | `color-SVG` |
| `UnicodeRange` | `U+0042` | `U+0042` |
| `S` | ` ` | ` ` |
| `Comment` | `/* text */` | ` text ` |
| `Delim` | `:`, `,`, `{` | `:`, `,`, `{` |

Versions
========
Tokens are post-processed to contain semantic values: CSS escapes are resolved, quotes and delimiters are stripped. Tokens can be re-emitted to valid CSS via `token.Emit(w)`.
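
The kind of escape resolution described can be sketched as follows (`resolveEscapes` is a hypothetical standalone helper written for illustration, not the package's implementation): a backslash followed by up to six hex digits yields that code point, with one trailing whitespace character consumed as the escape terminator, while a backslash before any other character simply stands for that character — so `te\st` resolves to `test` and `\30 x` to `0x`.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// resolveEscapes resolves CSS-style escapes in s. Illustrative sketch only:
// it works on bytes and ignores edge cases the real scanner handles.
func resolveEscapes(s string) string {
	var b strings.Builder
	for i := 0; i < len(s); i++ {
		if s[i] != '\\' || i+1 == len(s) {
			b.WriteByte(s[i])
			continue
		}
		i++
		// Hex escape: up to six hex digits, optionally followed by one
		// whitespace character that is consumed with the escape.
		j := i
		for j < len(s) && j-i < 6 && isHex(s[j]) {
			j++
		}
		if j > i {
			n, _ := strconv.ParseUint(s[i:j], 16, 32)
			b.WriteRune(rune(n))
			if j < len(s) && (s[j] == ' ' || s[j] == '\t' || s[j] == '\n') {
				j++ // consume the escape terminator
			}
			i = j - 1
			continue
		}
		// Simple escape: the next character stands for itself.
		b.WriteByte(s[i])
	}
	return b.String()
}

func isHex(c byte) bool {
	return '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F'
}

func main() {
	fmt.Println(resolveEscapes(`te\st`)) // test
	fmt.Println(resolveEscapes(`\30 x`)) // 0x
}
```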

1. 1.0.1 - June 21, 2016
* Fix issue with over-consuming strings delimited by apostrophes.
1. 1.0.0
* Initial release.
## Error handling

Backwards Incompatibility With Gorilla
======================================
Following the CSS specification, errors only occur for unclosed quotes or unclosed comments. Everything else is tokenizable; it is up to a parser to make sense of the token stream.
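
The two error conditions can be approximated by a standalone check (a deliberately crude sketch for illustration — it ignores escaped quotes, single quotes, and quotes inside comments; the real scanner detects these states during tokenization):

```go
package main

import (
	"fmt"
	"strings"
)

// untokenizable reports whether css ends in one of the only two states the
// scanner treats as errors: an unclosed comment or an unclosed string.
func untokenizable(css string) bool {
	// Unclosed comment: a "/*" with no "*/" after it.
	if i := strings.Index(css, "/*"); i >= 0 && !strings.Contains(css[i+2:], "*/") {
		return true
	}
	// Crude unclosed-string check: an odd number of double quotes.
	return strings.Count(css, `"`)%2 == 1
}

func main() {
	fmt.Println(untokenizable(`body { color: red }`)) // false
	fmt.Println(untokenizable(`/* never closed`))     // true
	fmt.Println(untokenizable(`content: "oops`))      // true
}
```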

This codebase has been made heavily backwards-incompatible to the original
codebase. The tokens emitted by this scanner are
post-processed into their "actual" value... that is, the CSS identifiers
`test` and `te\st` will both yield an Ident token containing `test`.
The URL token will contain the literal URL, with the CSS encoding processed
away. Etc. Code to correctly emit legal tokens has also been added.
## License

I've also taken the liberty of exporting the `Type` (`TokenType` in
Gorilla's version), which turns out to be pretty useful for external
processors. To reduce code stuttering, the Tokens have been renamed to
remove the `Token` prefix, and `TokenChar` is now `TokenDelim`, as that is
what CSS calls it. (Even if I tend to agree `TokenChar` makes more sense,
for this sort of code, best to stick to the standard.)

It turns out the combination of tokens having their "actual" value,
exposing the token types, and having code to re-emit the CSS has made
this useful to other people. If that's what you need, well, here it is.

On The Utility of Godoc.org
===========================

This project taught me to [search on godoc.org](https://godoc.org/) for Go
packages rather than Google. Google only showed the Gorilla tokenizer,
which I could tell would need many changes to work for me. Much later, I
searched on godoc, and had I found the [benbjohnson css
parser](https://github.com/benbjohnson/css) I probably would have used that
instead. By the time I found it, it was too late to switch practically.

That said, I _am_ still using this in what is now a production environment
for a non-trivial application, so for all I just said, this is a serious
codebase.
BSD 3-Clause. See [LICENSE](LICENSE) for details.
26 changes: 12 additions & 14 deletions scanner/doc.go
@@ -1,32 +1,30 @@
// Copyright 2012 The Gorilla Authors, Copyright 2015 Barracuda Networks.
// Copyright 2012 The Gorilla Authors, Copyright 2015 Barracuda Networks,
// Copyright 2020-2026 Patrick Gundlach.
// All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/*
Package scanner generates tokens for a CSS2/3 input.

It is a CSS2 scanner with bits of a CSS3 scanner in it.
Package scanner tokenizes CSS input following the CSS Syntax specification.

To use it, create a new scanner for a given CSS string and call Next() until
the token returned has type scanner.EOF or scanner.Error:

	s := scanner.New(myCSS)
	s := scanner.New(input)
	for {
		token := s.Next()
		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
		if token.Type == scanner.EOF || token.Type == scanner.Error {
			break
		}
		// Do something with the token...
		// Use token.Type, token.Value, token.Line, token.Column
	}

Following the CSS3 specification, an error can only occur when the scanner
finds an unclosed quote or unclosed comment. In these cases the text becomes
"untokenizable". Everything else is tokenizable and it is up to a parser
to make sense of the token stream (or ignore nonsensical token sequences).
Token values are post-processed to contain semantic content: CSS escapes are
resolved, quotes are stripped from strings, and delimiters are removed from
functions and URLs. Tokens can be re-emitted to valid CSS via token.Emit(w).

Note: the scanner doesn't perform lexical analysis or, in other words, it
doesn't care about the token context. It is intended to be used by a
lexer or parser.
Following the CSS specification, an error can only occur when the scanner
finds an unclosed quote or unclosed comment. Everything else is tokenizable
and it is up to a parser to make sense of the token stream.
*/
package scanner
28 changes: 0 additions & 28 deletions scanner/fuzz.go

This file was deleted.

103 changes: 103 additions & 0 deletions scanner/fuzz_test.go
@@ -0,0 +1,103 @@
package scanner

import (
	"bytes"
	"testing"
	"unicode/utf8"
)

// FuzzScanner tests that the scanner does not crash or panic on any valid
// UTF-8 input, and that each token individually survives an emit → re-parse
// round-trip.
//
// Full-stream round-trip (emit all tokens, reparse) is not tested here
// because the emit path has known adjacency limitations: tokens can merge
// or split when concatenated without separators.
func FuzzScanner(f *testing.F) {
	f.Add(`body { color: red; }`)
	f.Add(`.container { font-size: 16px; margin: 0 auto; }`)
	f.Add(`@font-face { font-family: 'F'; src: url('f.woff2') format('woff2'); }`)
	f.Add(`#id .class:hover::before { content: "hello"; }`)
	f.Add(`color: rgba(255, 128, 0 / 50%);`)
	f.Add(`--my-var: -42px;`)
	f.Add(`calc(100% - 20px)`)
	f.Add(`U+0042-00FF`)
	f.Add(`/* comment */ <!-- -->`)
	f.Add(`~= |= ^= $= *=`)
	f.Add("\uFEFF body { }")
	f.Add(`url(/*x*/pic.png)`)
	f.Add(`\30 x`)
	f.Add(`bar(moo) #hash 4.2 .42 42 42% .42% 4.2% 42px`)

	f.Fuzz(func(t *testing.T, input string) {
		if !utf8.ValidString(input) {
			return
		}

		// Phase 1: tokenize (must not crash or panic).
		tokens, hasError := fuzzParse(input)
		if hasError {
			return // unclosed quote/comment — expected
		}

		// Phase 2: per-token round-trip.
		// Each token's emitted form must reparse to the same token.
		// Tokens with known emit limitations (escape-produced special
		// chars) are silently skipped.
		for _, tok := range tokens {
			switch tok.Type {
			case BOM, EOF, Error:
				continue
			}
			// Skip tokens whose values contain characters that can't
			// survive the emit → reparse cycle:
			// - Backslashes in raw-emit tokens (re-interpreted as escapes)
			// - Control chars and whitespace (from hex escapes like \0, \A, \20)
			if hasUnsafeChars(tok.Value) {
				continue
			}
			var buf bytes.Buffer
			if err := tok.Emit(&buf); err != nil {
				continue
			}
			reparsed, parseErr := fuzzParse(buf.String())
			if parseErr || len(reparsed) != 1 {
				continue // emit limitation, not a scanner bug
			}
			if reparsed[0].Type != tok.Type {
				continue // type change from emit limitation
			}
			if reparsed[0].Value != tok.Value {
				t.Fatalf("Per-token round-trip value changed for %s:\n  original: %q\n  emitted:  %q\n  reparsed: %q\n  input: %q",
					tok.Type, tok.Value, buf.String(), reparsed[0].Value, input)
			}
		}
	})
}

// hasUnsafeChars reports whether s contains characters that cannot
// survive the emit → reparse cycle: control chars, whitespace, or
// backslashes (which raw-emit tokens don't escape).
func hasUnsafeChars(s string) bool {
	for i := range len(s) {
		if s[i] <= 0x20 || s[i] == 0x7F || s[i] == '\\' {
			return true
		}
	}
	return false
}

func fuzzParse(input string) ([]Token, bool) {
	var tokens []Token
	s := New(input)
	for {
		tok := s.Next()
		if tok.Type == Error {
			return nil, true
		}
		if tok.Type == EOF {
			return tokens, false
		}
		tokens = append(tokens, *tok)
	}
}
2 changes: 1 addition & 1 deletion scanner/go.mod
@@ -1,3 +1,3 @@
module github.com/speedata/css/scanner

go 1.14
go 1.24
22 changes: 0 additions & 22 deletions scanner/runfuzz.sh

This file was deleted.

7 changes: 0 additions & 7 deletions scanner/samples/lotsa_tokens

This file was deleted.
