Parser for IANA language subtag registry

[[ 🗃 ^OE9Go language-subtag-registry ]] :: [📥 Inbox] [📤 Outbox] [🐤 Followers] [🤝 Collaborators] [🛠 Changes]

Clone

HTTPS: darcs clone https://vervis.peers.community/repos/OE9Go

SSH: darcs clone USERNAME@vervis.peers.community:OE9Go

Tags

TODO

src / Data /

LanguageSubtagRegistry.hs

{- This file is part of language-subtag-registry.
 -
 - Written in 2018 by fr33domlover <fr33domlover@riseup.net>.
 -
 - ♡ Copying is an act of love. Please copy, reuse and share.
 -
 - The author(s) have dedicated all copyright and related and neighboring
 - rights to this software to the public domain worldwide. This software is
 - distributed without any warranty.
 -
 - You should have received a copy of the CC0 Public Domain Dedication along
 - with this software. If not, see
 - <http://creativecommons.org/publicdomain/zero/1.0/>.
 -}

{-# LANGUAGE OverloadedStrings #-}

module Data.LanguageSubtagRegistry
    ( TagType (..)
    , SubtagType (..)
    , Record (..)
    , Registry (..)
    , parseRegistry
    )
where

import Control.Applicative ((<|>), optional)
import Control.Arrow (second, left)
import Control.Monad ((<=<))
import Control.Monad.Trans.Except (except, runExcept)
import Data.Attoparsec.Text
import Data.Char
import Data.Map (Map)
import Data.Semigroup ((<>))
import Data.Set (Set)
import Data.Text (Text)
import Data.Time.Calendar (Day)
import Data.Time.Format

import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T

data TagType = Grandfathered | Redundant
    deriving (Eq, Show)

data SubtagType = Language | ExtLang | Script | Region | Variant
    deriving (Eq, Show)

data Record = Record
    { recordType           :: Either TagType SubtagType
    , recordTag            :: Text
    , recordDescription    :: [Text]
    , recordAdded          :: Day
    , recordDeprecated     :: Maybe Day
    , recordPreferredValue :: Maybe Text
    , recordPrefix         :: [Text]
    , recordSuppressScript :: Maybe Text
    , recordMacrolanguage  :: Maybe Text
    , recordScope          :: Maybe Text
    , recordComments       :: [Text]
    }
    deriving Show

data Registry = Registry
    { registryDate :: Day
    , registryTags :: [Record]
    }
    deriving Show

fieldName :: Parser Text
fieldName =
    takeWhile1 $
    \ c -> isAsciiUpper c || isAsciiLower c || isDigit c || c == '-'

fieldSep :: Parser ()
fieldSep = skipWhile (== ' ') *> skip (== ':') *> skipWhile (== ' ')

fieldBody :: Parser Text
fieldBody = fmap fst $ match $ optional $
    let chars = skip (> ' ') *> skipWhile (> ' ')
        spaces = skip (== ' ') *> skipWhile (== ' ')
        indented =
            spaces *> optional (endOfLine *> spaces) *> chars <|>
            endOfLine *> spaces *> chars
    in  (chars <|> indented) *> skipMany indented

field :: Parser (Text, Text)
field = (,) <$> fieldName <* fieldSep <*> fieldBody <* endOfLine

record :: Parser [(Text, Text)]
record = many1 field

registry :: Parser [[(Text, Text)]]
registry = record `sepBy1` (string "%%" *> endOfLine)

parseRecords :: Text -> Either String [[(Text, Text)]]
parseRecords = parseOnly $ registry <* endOfInput

groupFields :: [(Text, Text)] -> Map Text (Set Text)
groupFields = M.fromListWith S.union . map (second S.singleton)

single :: Text -> Map Text (Set Text) -> Either Text Text
single k m =
    case M.lookup k m of
        Nothing -> Left $ T.concat ["Key ", k, " not found"]
        Just s  ->
            case S.elems s of
                []  -> Left $ T.concat ["Key ", k, " not found"]
                [v] -> Right v
                _   -> Left $ "Multiple values found for key " <> k

msingle :: Text -> Map Text (Set Text) -> Either Text (Maybe Text)
msingle k m =
    case M.lookup k m of
        Nothing -> Right Nothing
        Just s  ->
            case S.elems s of
                []  -> Right Nothing
                [v] -> Right $ Just v
                _   -> Left $ "Multiple values found for key " <> k

multi :: Text -> Map Text (Set Text) -> Either Text [Text]
multi k m =
    case M.lookup k m of
        Nothing -> Left $ T.concat ["Key ", k, " not found"]
        Just s  ->
            case S.elems s of
                []  -> Left $ T.concat ["Key ", k, " not found"]
                vs  -> Right vs

mmulti :: Text -> Map Text (Set Text) -> Either Text [Text]
mmulti k m =
    case M.lookup k m of
        Nothing -> Right []
        Just s  -> Right $ S.elems s

parseDate :: Text -> Either Text Day
parseDate
    = maybe (Left "Invalid date") Right
    . parseTimeM False defaultTimeLocale (iso8601DateFormat Nothing)
    . T.unpack

extractType :: Map Text (Set Text) -> Either Text (Either TagType SubtagType)
extractType m = do
    t <- single "Type" m
    let subtag v =
            if "Subtag" `M.member` m
                then Right $ Right v
                else Left $ T.concat ["Subtag type ", t, "but no Subtag field"]
        tag v =
            if "Tag" `M.member` m
                then Right $ Left v
                else Left $ T.concat ["Tag type ", t, "but no Tag field"]
    case t of
        "language"      -> subtag Language
        "extlang"       -> subtag ExtLang
        "script"        -> subtag Script
        "region"        -> subtag Region
        "variant"       -> subtag Variant
        "grandfathered" -> tag Grandfathered
        "redundant"     -> tag Redundant
        _               -> Left $ "Unrecognized type " <> t

extractRecord :: Map Text (Set Text) -> Either Text Record
extractRecord m = Record
    <$> extractType m
    <*> runExcept (except (single "Tag" m) <|> except (single "Subtag" m))
    <*> multi "Description" m
    <*> (single "Added" m >>= parseDate)
    <*> (msingle "Deprecated" m >>= traverse parseDate)
    <*> msingle "PreferredValue" m
    <*> mmulti "Prefix" m
    <*> msingle "SuppressScript" m
    <*> msingle "Macrolanguage" m
    <*> msingle "Scope" m
    <*> mmulti "Comments" m

extractRegistry :: [Map Text (Set Text)] -> Either Text Registry
extractRegistry []     = Left "Empty registry, no records found"
extractRegistry (m:ms) = Registry
    <$> (single "File-Date" m >>= parseDate)
    <*> traverse extractRecord ms

parseRegistry :: Text -> Either Text Registry
parseRegistry =
    extractRegistry <=< fmap (map groupFields) . left T.pack . parseRecords
[See repo JSON]