eligibility-signposting-api/src/eligibility_signposting_api/services/processors/token_parser.py at 6fa24bca18ca2c1a6765618c918f98893f17249e · NHSDigital/eligibility-signposting-api · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
from dataclasses import dataclass


@dataclass
class ParsedToken:
    """
    The components of a token as returned by ``TokenParser.parse``.

    Attributes
    ----------
    attribute_level : str
        First dot-separated part of the token, upper-cased.
        Example: "PERSON" or "TARGET"
    attribute_name : str
        Second dot-separated part of the token, upper-cased, without any
        ``:DATE(...)`` modifier.
        Example: "POSTCODE" or "RSV"
    attribute_value : str | None
        Last part of a token that has more than two parts, upper-cased and
        without any ``:DATE(...)`` modifier; None for a two-part token.
        Example: "LAST_SUCCESSFUL_DATE" if attribute_level is TARGET
    format : str | None
        Text found between the parentheses of a ``:DATE(...)`` modifier, kept
        exactly as written; None when the token has no such modifier.
        Example: "%d %B %Y" if DATE formatting is used
    """

    attribute_level: str
    attribute_name: str
    attribute_value: str | None
    format: str | None


class TokenParser:
    """Parser for ``[[LEVEL.NAME]]`` / ``[[LEVEL.NAME.VALUE]]`` tokens.

    A token may end with a single ``:DATE(<format>)`` modifier (matched
    case-insensitively), e.g. ``[[PERSON.DATE_OF_BIRTH:DATE(%d %B %Y)]]``.
    """

    # Fewest dot-separated parts a token body must have (level + name).
    MIN_TOKEN_PARTS = 2

    # The only supported modifier. It is matched against everything that
    # follows the first ":" and must match in full, so trailing text or a
    # second modifier is rejected instead of leaking into the attribute name.
    _DATE_MODIFIER = re.compile(r"DATE\(([^()]*)\)", re.IGNORECASE)

    @staticmethod
    def parse(token: str) -> ParsedToken:
        """Parse a token into its parts.

        Steps:
        Strip the surrounding [[ ]]
        Check for empty body after stripping, e.g., '[[]]'
        Separate the optional modifier from the dotted path at the first ':'
        Check for empty parts created by leading/trailing dots or tokens with no dot
        Check that the modifier, when present, is exactly DATE(<format>)
        Return a ParsedToken object

        Parameters
        ----------
        token : str
            The token including its two-character opening and closing
            delimiters. The delimiters are removed by position and are not
            themselves validated.

        Returns
        -------
        ParsedToken
            Level, name and value upper-cased; value is None for a two-part
            token. The date format is returned exactly as written, or None
            when the token has no modifier.

        Raises
        ------
        ValueError
            "Invalid token." when the body is empty, has fewer than
            MIN_TOKEN_PARTS parts, or contains an empty part.
            "Invalid token format." when a ':' is present but what follows it
            is not exactly DATE(<format>).
        """

        token_body = token[2:-2]
        if not token_body:
            message = "Invalid token."
            raise ValueError(message)

        # Isolate the modifier BEFORE splitting on dots: a date format may
        # legitimately contain dots (e.g. "%d.%m.%Y") and must not be broken
        # up as if it were part of the attribute path. Colons inside the
        # format (e.g. "%H:%M") are fine because only the first ':' splits.
        path, colon, modifier = token_body.partition(":")

        token_parts = path.split(".")
        if len(token_parts) < TokenParser.MIN_TOKEN_PARTS or not all(token_parts):
            message = "Invalid token."
            raise ValueError(message)

        format_str = None
        if colon:
            format_match = TokenParser._DATE_MODIFIER.fullmatch(modifier)
            if format_match is None:
                message = "Invalid token format."
                raise ValueError(message)
            format_str = format_match.group(1)

        token_level = token_parts[0].upper()
        name = token_parts[1].upper()
        # With more than two parts the last one is the value; any parts in
        # between the name and the value are ignored.
        value = token_parts[-1].upper() if len(token_parts) > TokenParser.MIN_TOKEN_PARTS else None

        return ParsedToken(attribute_level=token_level, attribute_name=name, attribute_value=value, format=format_str)