Skip to content

Commit a009f33

Browse files
committed
Use CharSet to detect allowed identifier characters
1 parent 3629799 commit a009f33

2 files changed

Lines changed: 37 additions & 11 deletions

File tree

language-ecmascript.cabal

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ Category: Language
1717
Build-Type: Simple
1818
Synopsis: JavaScript parser and pretty-printer library
1919
Description:
20-
Tools for working with ECMAScript 3 (popularly known as JavaScript).
20+
Tools for working with ECMAScript 3 (popularly known as JavaScript).
2121
Includes a parser, pretty-printer, tools for working with source tree
2222
annotations and an arbitrary instance. See CHANGELOG for a summary of
2323
changes. The package follows the Haskell Package Versioning Policy since version 0.17.0.1.
@@ -45,12 +45,13 @@ Library
4545
QuickCheck >= 2.5 && < 3,
4646
template-haskell >= 2.7 && < 3,
4747
Diff == 0.3.*,
48-
testing-feat >= 0.4.0.2 && < 0.5
48+
testing-feat >= 0.4.0.2 && < 0.5,
49+
charset >= 0.3
4950
ghc-options:
5051
-fwarn-incomplete-patterns
5152
Exposed-Modules:
52-
Language.ECMAScript3
53-
Language.ECMAScript3.Lexer
53+
Language.ECMAScript3
54+
Language.ECMAScript3.Lexer
5455
Language.ECMAScript3.Parser
5556
Language.ECMAScript3.PrettyPrint
5657
Language.ECMAScript3.Syntax

src/Language/ECMAScript3/Lexer.hs

Lines changed: 32 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,9 @@ module Language.ECMAScript3.Lexer(lexeme,identifier,reserved,operator,reservedOp
1414

1515
import Prelude hiding (lex)
1616
import Data.Char
17+
import Data.Monoid ((<>), mconcat)
18+
import qualified Data.CharSet as Set
19+
import qualified Data.CharSet.Unicode.Category as Set
1720
import Text.Parsec
1821
import qualified Text.Parsec.Token as T
1922
import Language.ECMAScript3.Parser.State
@@ -22,14 +25,36 @@ import Control.Monad.Identity
2225
import Control.Applicative ((<$>), (<*>))
2326
import Data.Maybe (isNothing)
2427

25-
jsLetter :: (Stream s m Char) => ParsecT s u m Char
26-
jsLetter = satisfy (\x -> isAlpha x && x < '\65536') <?> "letter"
27-
28-
jsAlphaNum :: (Stream s m Char => ParsecT s u m Char)
29-
jsAlphaNum = satisfy (\x -> isAlphaNum x && x < '\65536') <?> "letter or digit"
28+
identifierStartCharSet :: Set.CharSet
29+
identifierStartCharSet =
30+
(filterBmpChars $ mconcat
31+
[ Set.fromDistinctAscList "$_"
32+
, Set.lowercaseLetter
33+
, Set.uppercaseLetter
34+
, Set.titlecaseLetter
35+
, Set.modifierLetter
36+
, Set.otherLetter
37+
, Set.letterNumber
38+
])
39+
40+
identifierRestCharSet :: Set.CharSet
41+
identifierRestCharSet =
42+
identifierStartCharSet
43+
<> (filterBmpChars $ mconcat
44+
[ Set.nonSpacingMark
45+
, Set.spacingCombiningMark
46+
, Set.decimalNumber
47+
, Set.connectorPunctuation
48+
])
49+
50+
filterBmpChars :: Set.CharSet -> Set.CharSet
51+
filterBmpChars = Set.filter (< '\65536')
3052

3153
identifierStart :: Stream s Identity Char => Parser s Char
32-
identifierStart = jsLetter <|> oneOf "$_"
54+
identifierStart = satisfy (flip Set.member identifierStartCharSet) <?> "letter, '$', '_'"
55+
56+
identifierRest :: Stream s Identity Char => Parser s Char
57+
identifierRest = satisfy (flip Set.member identifierRestCharSet) <?> "letter, digits, '$', '_' ..."
3358

3459
javascriptDef :: Stream s Identity Char =>T.GenLanguageDef s ParserState Identity
3560
javascriptDef =
@@ -38,7 +63,7 @@ javascriptDef =
3863
"//"
3964
False -- no nested comments
4065
identifierStart
41-
(jsAlphaNum <|> oneOf "$_") -- identifier rest
66+
identifierRest
4267
(oneOf "{}<>()~.,?:|&^=!+-*/%!") -- operator start
4368
(oneOf "=<>|&+") -- operator rest
4469
["break", "case", "catch", "const", "continue", "debugger",

0 commit comments

Comments
 (0)