Skip to content

Commit 2fc177c

Browse files
authored
Merge pull request #82 from berdario/unicode-categories
Use Unicode categories for lexing JS identifiers
2 parents 33b9ba4 + 8a2ed3d commit 2fc177c

2 files changed

Lines changed: 36 additions & 6 deletions

File tree

language-ecmascript.cabal

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ Category: Language
1717
Build-Type: Simple
1818
Synopsis: JavaScript parser and pretty-printer library
1919
Description:
20-
Tools for working with ECMAScript 3 (popularly known as JavaScript).
20+
Tools for working with ECMAScript 3 (popularly known as JavaScript).
2121
Includes a parser, pretty-printer, tools for working with source tree
2222
annotations and an arbitrary instance. See CHANGELOG for a summary of
2323
changes. The package follows the Haskell Package Versioning Policy since version 0.17.0.1.
@@ -45,12 +45,13 @@ Library
4545
QuickCheck >= 2.5 && < 3,
4646
template-haskell >= 2.7 && < 3,
4747
Diff == 0.3.*,
48-
testing-feat >= 0.4.0.2 && < 0.5
48+
testing-feat >= 0.4.0.2 && < 0.5,
49+
charset >= 0.3
4950
ghc-options:
5051
-fwarn-incomplete-patterns
5152
Exposed-Modules:
52-
Language.ECMAScript3
53-
Language.ECMAScript3.Lexer
53+
Language.ECMAScript3
54+
Language.ECMAScript3.Lexer
5455
Language.ECMAScript3.Parser
5556
Language.ECMAScript3.PrettyPrint
5657
Language.ECMAScript3.Syntax

src/Language/ECMAScript3/Lexer.hs

Lines changed: 31 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,10 @@ module Language.ECMAScript3.Lexer(lexeme,identifier,reserved,operator,reservedOp
1313
,hexIntLit,decIntLit, decDigits, decDigitsOpt, exponentPart, decLit) where
1414

1515
import Prelude hiding (lex)
16+
import Data.Char
17+
import Data.Monoid ((<>), mconcat)
18+
import qualified Data.CharSet as Set
19+
import qualified Data.CharSet.Unicode.Category as Set
1620
import Text.Parsec
1721
import qualified Text.Parsec.Token as T
1822
import Language.ECMAScript3.Parser.State
@@ -21,8 +25,33 @@ import Control.Monad.Identity
2125
import Control.Applicative ((<$>), (<*>))
2226
import Data.Maybe (isNothing)
2327

28+
-- | The set of characters accepted as the first character of an identifier.
--
-- Built by combining (via 'mconcat') the two literal characters @$@ and @_@
-- with the character sets exported by "Data.CharSet.Unicode.Category" for
-- lowercase, uppercase, titlecase, modifier and other letters, plus letter
-- numbers.
--
-- NOTE(review): this selection looks like the @IdentifierStart@ production of
-- ECMA-262 section 7.6 (minus the unicode escape form) -- confirm against the
-- specification. The 'Monoid' instance of 'Set.CharSet' is presumably set
-- union; verify in the charset documentation.
--
-- The literal @"$_"@ is already in ascending code point order (@$@ sorts
-- before @_@), which is what the name @fromDistinctAscList@ asks for.
identifierStartCharSet :: Set.CharSet
29+
identifierStartCharSet =
30+
mconcat
31+
[ Set.fromDistinctAscList "$_"
32+
, Set.lowercaseLetter
33+
, Set.uppercaseLetter
34+
, Set.titlecaseLetter
35+
, Set.modifierLetter
36+
, Set.otherLetter
37+
, Set.letterNumber
38+
]
39+
40+
-- | The set of characters accepted after the first character of an
-- identifier.
--
-- Extends 'identifierStartCharSet' (so every start character is also a valid
-- continuation character) with the "Data.CharSet.Unicode.Category" sets for
-- non-spacing marks, spacing combining marks, decimal numbers and connector
-- punctuation.
--
-- NOTE(review): this looks like the @IdentifierPart@ production of ECMA-262
-- section 7.6 -- confirm against the specification.
identifierRestCharSet :: Set.CharSet
41+
identifierRestCharSet =
42+
identifierStartCharSet
43+
<> mconcat
44+
[ Set.nonSpacingMark
45+
, Set.spacingCombiningMark
46+
, Set.decimalNumber
47+
, Set.connectorPunctuation
48+
]
49+
2450
-- | Parser for a single character that may begin an identifier.
--
-- In this diff the line marked @25-@ is the removed definition
-- (@letter <|> oneOf "$_"@) and the line marked @51+@ is its replacement,
-- which accepts any character that is a member of 'identifierStartCharSet'.
-- On failure the parser is labelled @letter, '$', '_'@ in the error message.
identifierStart :: Stream s Identity Char => Parser s Char
25-
identifierStart = letter <|> oneOf "$_"
51+
identifierStart = satisfy (flip Set.member identifierStartCharSet) <?> "letter, '$', '_'"
52+
53+
-- | Parser for a single character that may continue an identifier.
--
-- Accepts any character that is a member of 'identifierRestCharSet'; on
-- failure the parser is labelled @letter, digits, '$', '_' ...@ in the error
-- message. Later in this diff it replaces the inline
-- @alphaNum <|> oneOf "$_"@ argument of @javascriptDef@.
identifierRest :: Stream s Identity Char => Parser s Char
54+
identifierRest = satisfy (flip Set.member identifierRestCharSet) <?> "letter, digits, '$', '_' ..."
2655

2756
javascriptDef :: Stream s Identity Char =>T.GenLanguageDef s ParserState Identity
2857
javascriptDef =
@@ -31,7 +60,7 @@ javascriptDef =
3160
"//"
3261
False -- no nested comments
3362
identifierStart
34-
(alphaNum <|> oneOf "$_") -- identifier rest
63+
identifierRest
3564
(oneOf "{}<>()~.,?:|&^=!+-*/%!") -- operator start
3665
(oneOf "=<>|&+") -- operator rest
3766
["break", "case", "catch", "const", "continue", "debugger",

0 commit comments

Comments
 (0)