Lexer

-- Turn a character string into a token string.
--
-- Implemented as output automaton.

module Lexer where

open import Prelude

data Symbol : Type where
  lpar   : Symbol  -- '('
  rpar   : Symbol  -- ')'
  colon  : Symbol  -- ':'
  arrow  : Symbol  -- '→'
  lambda : Symbol  -- 'λ'

-- Decidable equality for Symbol

_≟_ : (x y : Symbol) → Dec (x ≡ y)
-- Same symbols are equal
lpar   ≟ lpar   = yes refl
rpar   ≟ rpar   = yes refl
colon  ≟ colon  = yes refl
arrow  ≟ arrow  = yes refl
lambda ≟ lambda = yes refl
-- Others are inequal
lpar   ≟ rpar   = no λ ()
lpar   ≟ colon  = no λ ()
lpar   ≟ arrow  = no λ ()
lpar   ≟ lambda = no λ ()
rpar   ≟ lpar   = no λ ()
rpar   ≟ colon  = no λ ()
rpar   ≟ arrow  = no λ ()
rpar   ≟ lambda = no λ ()
colon  ≟ lpar   = no λ ()
colon  ≟ rpar   = no λ ()
colon  ≟ arrow  = no λ ()
colon  ≟ lambda = no λ ()
arrow  ≟ lpar   = no λ ()
arrow  ≟ rpar   = no λ ()
arrow  ≟ colon  = no λ ()
arrow  ≟ lambda = no λ ()
lambda ≟ lpar   = no λ ()
lambda ≟ rpar   = no λ ()
lambda ≟ colon  = no λ ()
lambda ≟ arrow  = no λ ()

data Token : Type where
  symbol : (t : Symbol) → Token
  ident  : (x : String) → Token  -- non-empty string of non-space non-symbol characters

private

  -- We have just "one" state, but it can store an identifier prefix of arbitrary length.
  -- Formally, this is not a finite automaton.

  data State : Type where
    state : (rev-chars : List Char) → State  -- characters that will form an ident

  pattern initial-state = state []

  -- The result of a step is a new step and a (possibly empty) list of tokens
  -- output by the automaton during the step.

  -- Push a new character onto the stack of saved characters.

  save-char : (c : Char) (s : State) → State
  save-char c (state rev-chars) = state (c ∷ rev-chars)

  record Step : Type where
    constructor stepped; no-eta-equality; pattern
    field
      new-state : State
      output    : List Token
  open Step

  -- Output the stored characters (if any) as identifier token.

  cons-ident : (rev-chars : List Char) → List Token → List Token
  cons-ident [] = id
  cons-ident rev-chars = ident (String.fromList (reverse rev-chars)) ∷_
    -- here we lose the information that cs is non-empty

  -- Output the given tokens, possibly prefixed by an identifier token
  -- harvested from the state.

  step-symbol : (s : State) (t : List Token) → Step
  step-symbol (state rev-chars) ts = stepped initial-state (cons-ident rev-chars ts)

  -- Transition function of the automaton.

  step : (s : State) (c : Char) → Step
  step s '(' = step-symbol s (symbol lpar   ∷ [])
  step s ')' = step-symbol s (symbol rpar   ∷ [])
  step s ':' = step-symbol s (symbol colon  ∷ [])
  step s '→' = step-symbol s (symbol arrow  ∷ [])
  step s 'λ' = step-symbol s (symbol lambda ∷ [])
  step s c = if isSpace c then step-symbol s [] else stepped (save-char c s) []

  -- Iterating the transition function.

  run : (s : State) (cs : List Char) → List Token
  run s (c ∷ cs) = case step s c of λ where
    (stepped s' ts) → ts ++ run s' cs
  run (state rev-chars) [] = cons-ident rev-chars []

-- Entrypoint: fully lex the given string.

lex : (s : String) → List Token
lex s = run initial-state (String.toList s)

-- Test

app-string : String
app-string = "x x"

app-tokens : List Token
app-tokens = lex app-string

id-string : String
id-string = "λx→x"

id-tokens : List Token
id-tokens = lex id-string

id-typed-string : String
id-typed-string = "λ (x : (ℕ → ℕ) → ℕ → ℕ) → x"

id-typed-tokens : List Token
id-typed-tokens = lex id-typed-string

one-string : String
one-string = "λ x → λ y → x y"

one-tokens : List Token
one-tokens = lex one-string

one-par-string : String
one-par-string = "λ x → λ y → x(y)"

one-par-tokens : List Token
one-par-tokens = lex one-par-string

K-string : String
K-string = "λ x → λ y → x"

K-tokens : List Token
K-tokens = lex K-string

S-string : String
S-string = "λ x → λ y → λ z → x z (y z)"

S-tokens : List Token
S-tokens = lex S-string

sapp-string : String
sapp-string = "λ x → x x"

sapp-tokens : List Token
sapp-tokens = lex sapp-string

zero-string : String
zero-string = "λ (f : ℕ → ℕ) → λ (x : ℕ) → x"

zero-tokens : List Token
zero-tokens = lex zero-string

add-string : String
add-string = "λ (n : (ℕ → ℕ) → ℕ → ℕ) → λ (m : (ℕ → ℕ) → ℕ → ℕ) → λ (f : ℕ → ℕ) → λ (x : ℕ) → n f (m f x)"

add-tokens : List Token
add-tokens = lex add-string