Parser for IANA language subtag registry
[[ 🗃
^OE9Go language-subtag-registry
]] ::
[📥 Inbox]
[📤 Outbox]
[🐤 Followers]
[🤝 Collaborators]
[🛠 Changes]
Clone
HTTPS:
darcs clone https://vervis.peers.community/repos/OE9Go
SSH:
darcs clone USERNAME@vervis.peers.community:OE9Go
Tags
TODO
LanguageSubtagRegistry.hs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 | {- This file is part of language-subtag-registry.
-
- Written in 2018 by fr33domlover <fr33domlover@riseup.net>.
-
- ♡ Copying is an act of love. Please copy, reuse and share.
-
- The author(s) have dedicated all copyright and related and neighboring
- rights to this software to the public domain worldwide. This software is
- distributed without any warranty.
-
- You should have received a copy of the CC0 Public Domain Dedication along
- with this software. If not, see
- <http://creativecommons.org/publicdomain/zero/1.0/>.
-}
{-# LANGUAGE OverloadedStrings #-}
module Data.LanguageSubtagRegistry
( TagType (..)
, SubtagType (..)
, Record (..)
, Registry (..)
, parseRegistry
)
where
import Control.Applicative ((<|>), optional)
import Control.Arrow (second, left)
import Control.Monad ((<=<))
import Control.Monad.Trans.Except (except, runExcept)
import Data.Attoparsec.Text
import Data.Char
import Data.Map (Map)
import Data.Semigroup ((<>))
import Data.Set (Set)
import Data.Text (Text)
import Data.Time.Calendar (Day)
import Data.Time.Format
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
data TagType = Grandfathered | Redundant
deriving (Eq, Show)
data SubtagType = Language | ExtLang | Script | Region | Variant
deriving (Eq, Show)
data Record = Record
{ recordType :: Either TagType SubtagType
, recordTag :: Text
, recordDescription :: [Text]
, recordAdded :: Day
, recordDeprecated :: Maybe Day
, recordPreferredValue :: Maybe Text
, recordPrefix :: [Text]
, recordSuppressScript :: Maybe Text
, recordMacrolanguage :: Maybe Text
, recordScope :: Maybe Text
, recordComments :: [Text]
}
deriving Show
data Registry = Registry
{ registryDate :: Day
, registryTags :: [Record]
}
deriving Show
fieldName :: Parser Text
fieldName =
takeWhile1 $
\ c -> isAsciiUpper c || isAsciiLower c || isDigit c || c == '-'
fieldSep :: Parser ()
fieldSep = skipWhile (== ' ') *> skip (== ':') *> skipWhile (== ' ')
fieldBody :: Parser Text
fieldBody = fmap fst $ match $ optional $
let chars = skip (> ' ') *> skipWhile (> ' ')
spaces = skip (== ' ') *> skipWhile (== ' ')
indented =
spaces *> optional (endOfLine *> spaces) *> chars <|>
endOfLine *> spaces *> chars
in (chars <|> indented) *> skipMany indented
field :: Parser (Text, Text)
field = (,) <$> fieldName <* fieldSep <*> fieldBody <* endOfLine
record :: Parser [(Text, Text)]
record = many1 field
registry :: Parser [[(Text, Text)]]
registry = record `sepBy1` (string "%%" *> endOfLine)
parseRecords :: Text -> Either String [[(Text, Text)]]
parseRecords = parseOnly $ registry <* endOfInput
groupFields :: [(Text, Text)] -> Map Text (Set Text)
groupFields = M.fromListWith S.union . map (second S.singleton)
single :: Text -> Map Text (Set Text) -> Either Text Text
single k m =
case M.lookup k m of
Nothing -> Left $ T.concat ["Key ", k, " not found"]
Just s ->
case S.elems s of
[] -> Left $ T.concat ["Key ", k, " not found"]
[v] -> Right v
_ -> Left $ "Multiple values found for key " <> k
msingle :: Text -> Map Text (Set Text) -> Either Text (Maybe Text)
msingle k m =
case M.lookup k m of
Nothing -> Right Nothing
Just s ->
case S.elems s of
[] -> Right Nothing
[v] -> Right $ Just v
_ -> Left $ "Multiple values found for key " <> k
multi :: Text -> Map Text (Set Text) -> Either Text [Text]
multi k m =
case M.lookup k m of
Nothing -> Left $ T.concat ["Key ", k, " not found"]
Just s ->
case S.elems s of
[] -> Left $ T.concat ["Key ", k, " not found"]
vs -> Right vs
mmulti :: Text -> Map Text (Set Text) -> Either Text [Text]
mmulti k m =
case M.lookup k m of
Nothing -> Right []
Just s -> Right $ S.elems s
parseDate :: Text -> Either Text Day
parseDate
= maybe (Left "Invalid date") Right
. parseTimeM False defaultTimeLocale (iso8601DateFormat Nothing)
. T.unpack
extractType :: Map Text (Set Text) -> Either Text (Either TagType SubtagType)
extractType m = do
t <- single "Type" m
let subtag v =
if "Subtag" `M.member` m
then Right $ Right v
else Left $ T.concat ["Subtag type ", t, "but no Subtag field"]
tag v =
if "Tag" `M.member` m
then Right $ Left v
else Left $ T.concat ["Tag type ", t, "but no Tag field"]
case t of
"language" -> subtag Language
"extlang" -> subtag ExtLang
"script" -> subtag Script
"region" -> subtag Region
"variant" -> subtag Variant
"grandfathered" -> tag Grandfathered
"redundant" -> tag Redundant
_ -> Left $ "Unrecognized type " <> t
extractRecord :: Map Text (Set Text) -> Either Text Record
extractRecord m = Record
<$> extractType m
<*> runExcept (except (single "Tag" m) <|> except (single "Subtag" m))
<*> multi "Description" m
<*> (single "Added" m >>= parseDate)
<*> (msingle "Deprecated" m >>= traverse parseDate)
<*> msingle "PreferredValue" m
<*> mmulti "Prefix" m
<*> msingle "SuppressScript" m
<*> msingle "Macrolanguage" m
<*> msingle "Scope" m
<*> mmulti "Comments" m
extractRegistry :: [Map Text (Set Text)] -> Either Text Registry
extractRegistry [] = Left "Empty registry, no records found"
extractRegistry (m:ms) = Registry
<$> (single "File-Date" m >>= parseDate)
<*> traverse extractRecord ms
parseRegistry :: Text -> Either Text Registry
parseRegistry =
extractRegistry <=< fmap (map groupFields) . left T.pack . parseRecords
|