README.Data.Trie.NonDependent

------------------------------------------------------------------------
-- The Agda standard library
--
-- Example use case for a trie: a wee generic lexer
------------------------------------------------------------------------

{-# OPTIONS --cubical-compatible --sized-types #-}

module README.Data.Trie.NonDependent where

------------------------------------------------------------------------
-- Introduction

-- A Trie is a tree of values indexed by words in a finite language. It
-- allows users to quickly compute the Brzozowski derivative of that
-- little mapping from words to values.

-- In the most general case, values can depend upon the list of characters
-- that constitutes the path leading to them. Here however we consider a
-- non-dependent case (cf. README.Trie.Dependent for a dependent use case).

-- We can recognize keywords by storing the list of characters they
-- correspond to as paths in a Trie and the constructor they are decoded
-- to as the tree's values.

-- E.g.
-- [     .      ] is a root
-- [  -- m -->  ] is an m-labeled edge and is followed when reading 'm'
-- [    (X)     ] is a value leaf storing constructor X

--                     --> -- m --> -- m --> -- a --> (LEMMA)
--                    /
--       -- l --> -- e --> -- t --> (LET)
--      /
--     /    -- u --> -- t --> -- u --> -- a --> -- l --> (MUTUAL)
--    /    /
--  .< -- m --> -- o --> -- d --> -- u --> -- l --> -- e --> (MODULE)
--    \
--     -- w --> -- h --> -- e --> -- r --> -- e --> (WHERE)
--                           \
--                            --> -- n --> (WHEN)


-- after reading 'w', we get the derivative:

--  . -- h --> -- e --> -- r --> -- e --> (WHERE)
--                 \
--                  --> -- n --> (WHEN)


open import Level
open import Data.Unit
open import Data.Bool
open import Data.Char        as Char
import Data.Char.Properties  as Char
open import Data.List.Base   as List using (List; []; _∷_)
open import Data.List.Fresh  as List# using (List#; []; _∷#_)
open import Data.Maybe       as Maybe
open import Data.Product.Base as Prod using (_×_; ∃; proj₁; _,_)
open import Data.String.Base as String using (String)
open import Data.String.Properties as String using (_≟_)
open import Data.These       as These

open import Function.Base using (case_of_; _$_; _∘′_; id; _on_)
open import Relation.Nary
open import Relation.Binary.Core using (Rel)
open import Relation.Nullary.Decidable using (¬?)

open import Data.Trie Char.<-strictTotalOrder
open import Data.Tree.AVL.Value

------------------------------------------------------------------------
-- Generic lexer

record Lexer t : Set (suc t) where
  field
  -- Our lexer is parametrised over the type of tokens
    Tok : Set t

  -- Keywords are distinguished strings associated to tokens
  Keyword : Set t
  Keyword = String × Tok

  -- Two keywords are considered distinct if the strings are not equal
  Distinct : Rel Keyword 0ℓ
  Distinct a b = ⌊ ¬? ((proj₁ a) String.≟ (proj₁ b)) ⌋

  field

  -- We ask users to provide us with a fresh list of keywords to guarantee
  -- that no two keywords share the same string representation
    keywords : List# Keyword Distinct

  -- Some characters are special: they are separators, breaking a string
  -- into a list of tokens. Some are associated to a token value
  -- (e.g. parentheses) others are not (e.g. space)
    breaking : Char → ∃ λ b → if b then Maybe Tok else Lift _ ⊤

  -- Finally, strings which are not decoded as keywords are coerced
  -- using a function to token values.
    default  : String → Tok


module _ {t} (L : Lexer t) where
  open Lexer L

  tokenize : String → List Tok
  tokenize = start ∘′ String.toList where

   mutual

    -- A Trie is defined for an alphabet of strictly ordered letters (here
    -- we have picked Char for letters and decided to use the strict total
    -- order induced by their injection into ℕ as witnessed by the statement
    -- open import Data.Trie Char.strictTotalOrder earlier in this file).

    -- It is parametrised by a set of Values indexed over list of letters.
    -- Because we focus on the non-dependent case, we pick the constant
    -- family of Value uniformly equal to Tok. It is trivially compatible
    -- with the notion of equality underlying the strict total order on Chars.

    Keywords : Set _
    Keywords = Trie (const _ Tok) _

    -- We build a trie from the association list so that we may easily
    -- compute the successive derivatives obtained by eating the
    -- characters one by one

    init : Keywords
    init = fromList $ List.map (Prod.map₁ String.toList) $ proj₁ $ List#.toList keywords

    -- Kickstart the tokeniser with an empty accumulator and the initial
    -- trie.
    start : List Char → List Tok
    start = loop [] init

    -- The main loop
    loop : (acc  : List Char)  → -- chars read so far in this token
           (toks : Keywords)   → -- keyword candidates left at this point
           (input : List Char) → -- list of chars to tokenize
           List Tok
    -- Empty input: finish up, check whether we have a non-empty accumulator
    loop acc toks []         = push acc []
    -- At least one character
    loop acc toks (c ∷ cs)   = case breaking c of λ where
      -- if we are supposed to break on this character, we do
      (true , m)  → push acc $ maybe′ _∷_ id m $ start cs
      -- otherwise we see whether it leads to a recognized keyword
      (false , _) → case lookupValue toks (c ∷ []) of λ where
        -- if so we can forget about the current accumulator and
        -- restart the tokenizer on the rest of the input
        (just tok) → tok ∷ start cs
        -- otherwise we record the character we read in the accumulator,
        -- compute the derivative of the map of keyword candidates and
        -- keep going with the rest of the input
        nothing    → loop (c ∷ acc) (lookupTrie toks c) cs

    -- Grab the accumulator and, unless it is empty, push it on top of
    -- the decoded list of tokens
    push : List Char → List Tok → List Tok
    push [] ts = ts
    push cs ts = default (String.fromList (List.reverse cs)) ∷ ts


------------------------------------------------------------------------
-- Concrete instance

-- A small set of keywords for a language with expressions of the form
-- `let x = e in b`.

module LetIn where

  data TOK : Set where
    LET EQ IN : TOK
    LPAR RPAR : TOK
    ID : String → TOK

  keywords : List# (String × TOK) (λ a b → ⌊ ¬? ((proj₁ a) String.≟ (proj₁ b)) ⌋)
  keywords =  ("let" , LET)
           ∷# ("="   , EQ)
           ∷# ("in"  , IN)
           ∷# []

  -- Breaking characters: spaces (thrown away) and parentheses (kept)
  breaking : Char → ∃ (λ b → if b then Maybe TOK else Lift 0ℓ ⊤)
  breaking c = if isSpace c then true , nothing else parens c where

    parens : Char → _
    parens '(' = true , just LPAR
    parens ')' = true , just RPAR
    parens _   = false , _

  default : String → TOK
  default = ID

letIn : Lexer 0ℓ
letIn = record { LetIn }

open import Agda.Builtin.Equality

-- A test case:

open LetIn
_ : tokenize letIn "fix f x = let b = fix f in (f b) x"
  ≡ ID "fix"
  ∷ ID "f"
  ∷ ID "x"
  ∷ EQ
  ∷ LET
  ∷ ID "b"
  ∷ EQ
  ∷ ID "fix"
  ∷ ID "f"
  ∷ IN
  ∷ LPAR
  ∷ ID "f"
  ∷ ID "b"
  ∷ RPAR
  ∷ ID "x"
  ∷ []
_ = refl