util.nix/package/hemar/parser/hemar.sh

#!/bin/dash

log notice "running"

# segmented-path
#   segment
# Syntax scheme:
#
# hemar
#   elements
#
# elements
#   element
#   element elements
#
# element
#   tag
#   text
#
# text
#   text-item
#   text-item text
#
# text-item
#   '0020' . '10FFFF' - '{'
#   nopatern
#
# tag
#   '{[' ws path           ws ']}'
#   '{[' ws for            ws ']}'
#   '{[' ws "done"         ws ']}'
#   '{[' ws '{['           ws ']}'
#
# # loop tag
# for
#   "for" ws string ws "in" ws path
#
# # path
# path
#   '.'
#   segmented-path
#
# segmented-path
#   segment
#   segment '.' segmented-path
#
# segment
#   string
#   index
#
# index
#   '['     digit           ']'
#   '['     onenine digits  ']'
#   '[' '-' onenine         ']'
#   '[' '-' onenine digits  ']'
#
# # types
# string
#   unquoted-string
#   quoted-string
#
# unquoted-string
#   unquoted-character
#   unquoted-character quoted-string
#
# unquoted-character
#   '0020' . '10FFFF' - '"' - '\' - '.' - '[' - ']' - '{' - '}'
#
# quoted-string
#   unquoted-character
#   unquoted-character string
#
# quoted-character
#   '0000' . '10FFFF' - '"'
#   '"' '"'
#
# digits
#   digit
#   digit digits
#
# digit
#   '0'
#   onenine
#
# onenine
#   '1' . '9'
#
# # paterns
# ws
#   ''
#   '\x20' ws
#   '\x0a' ws
#   '\x0d' ws
#   '\x09' ws
#
# nopatern
#   '{' '0020' . '10FFFF' - '['


# AST Plex:
#
# Type = 0..=5
#
# Text = string            # just a text body
#
# Interpolation = string   # path to variable
#
# Section = {
#   v = string      # item variable name for loop
#   p = string      # path to array for iteration
#   b = [Element]   # section body
#
# }
#
# End = null
#
# Compute = {
#   l     = string  # programing language
#   b     = string  # function body
# }
#
# Element = {
#   t = Type        # element type
#   b = Text        # element body
#       | Interpolation
#       | Section
#       | End
#       | Include
#       | Compute
# }
#
# AbstarctSyntaxTree (ATS) = {
#    e = [Element]  # elements array
# }

AST=$(mktemp)
AST_key='.'
trap 'rm -f "$AST"' EXIT INT HUP

yq -o j -i '.' "$AST"

log debug "AST path: ${WHITE}${AST}"

# 0 - text
# 1 - deside tag type
# 2 - interpolation
# 3 - section
# 4 - include
# 5 - compute
STAGE=0

# is_ws(char) -> bool
is_ws() {
  ord=$(printf '%d' "'$1")
  case $ord in
    32|10|13|9) # <-> \x20 | \x0a | \x0d | \x09 <-> space | \n | \r | \t
      return 0
    ;;
  esac
  return 1
}

log_buffers() {
  log debug "buff 1: $WHITE$(cat "$STAGE_BUFFER_1")"
}

# remove_last_double_quote(text) -> text
remove_last_double_quote() {
  printf '%s' "$1" | sed 's/\(.*\)"\(.*\)/\1\2/'
}

#buf_read(buf?) -> text
buf_read() {
  local buf
  if [ ${1+x} ]; then
    buf=${1}
  else
    buf=${CURRENT_STAGE_BUFFER}
  fi

  cat "$buf"
}

buf_reset() {
  : > "$STAGE_BUFFER_1"

  CURRENT_STAGE_BUFFER="$STAGE_BUFFER_1"
}

STAGE_BUFFER_1="$(mktemp)"
CURRENT_STAGE_BUFFER=$STAGE_BUFFER_1
trap 'rm -f "$STAGE_BUFFER_1"' EXIT INT HUP
log debug "stage buffer 1: ${WHITE}$STAGE_BUFFER_1"

# json_escape(value) -> str
json_escape() {
  # TODO: escape functionality
  printf '%s' "${1}" | sed 's/"/\\"/g'
}

# finds close pattern and store the char to the stage buffers separating by spaces
parse_tag() {
  local char="${1:?}"
  # NOTE: any return 1 - skip char, regular_char + return 1 - write char
  # TAG_seen_first_ws     - we've already handled the first whitespace after `{[...]`
  # TAG_in_ws_run      - we’re currently in a run of whitespace chars
  # TAG_pending_close  - we saw `]` and are checking if the next char is `}`

  string_grammar() {
    if [ "${TAG_in_quoted_string+x}" ]; then
      if [ "${TAG_end_quote_pending+x}" ]; then
        case "$char" in
          '"')
            unset TAG_end_quote_pending
          ;;
          '.')
            TAG_dote=1
	    return 1
          ;;
          *)  log error "unexpected end of quote on $WHITE$LINE_N$NC:$WHITE$CHAR_N" ;;
        esac
      elif [ "$char" = '"' ]; then
        TAG_end_quote_pending=1
        return 1
      fi
    else

      # shellcheck disable=SC1003
      case "$char" in
        '['|']'|'{'|'}'|'"'|'\')
          log error "not allowed character $WHITE$char$NC on $WHITE$LINE_N$NC:$WHITE$CHAR_N"
          log error "try to use quoted string"
        ;;
        '.')
          TAG_dote=1
	  return 1
        ;;
      esac
    fi
  }

  write_char() {
    [ ${TAG_next_argument_redgect+x} ] && {
      log error "too many argument for tag type $WHITE${TAG_type:?}$NC on $WHITE$LINE_N$NC:$WHITE$CHAR_N$NC";
      exit 1;
    }
    [ ${TAG_in_ws_run+x} ] && {
        unset TAG_in_ws_run
        if [ "${TAG_seen_first_ws+x}" ]; then
          case "${TAG_type:-unknown}" in
            unknown) finalize_first_arg ;;
            for)
              # NOTE:
	      # grammar: for i in key."subkey" ; so we know
              # 1st argument after `for` - string (name of variable)
              # 2nd                      - 'in'   (just keyword)
              # 3rd                      - path   (path to array in Model)
              case ${TAG_grammar_mode:-1} in
                string)
                ;;
                kw_in)
                ;;
                path)
                ;;
              esac
            ;;
            *) log panic 'unexpected TAG_type'; exit 13; ;;
          esac

          # NOTE: prepare to next argument
          buf_reset
        else
          TAG_seen_first_ws=1
        fi
    }
    printf '%s' "$1" >> "$CURRENT_STAGE_BUFFER"
  }

  if   [ ! "${TAG_pending_close+x}" ] && [ "$char" = ']' ]; then
    TAG_pending_close=1
    # NOTE: skip ']' but remember to check next char for a possible '}'
    return 1
  elif [ "${TAG_pending_close+x}" ]; then
    unset TAG_pending_close
    if [ "$char" = '}' ]; then
      # NOTE: found `]}` — finish bracket parsing
      return 0
    else
      # NOTE: `]` was not followed by `}`, so emit the `]` we skipped
      printf ']' >> "$CURRENT_STAGE_BUFFER"
    fi
  else
    is_ws "$char" && { TAG_in_ws_run=1; return 1; }
  fi

  case "${TAG_grammar_mode:-unknown}" in
    unknown)
      # NOTE: we always know grammar mode but first argument
      # just regular parse as string or as path if seen unquoted '.'

      # NOTE: this is after char's checked on ws
      # so if TAG_in_ws_run exists then this is first char in argument (just after ws)
      [ "${TAG_dote+x}" ] && { log panic "TAG_dote true in unknown TAG_grammar_mode"; exit 13; }
      if [ "${TAG_in_ws_run+x}" ] && [ "$char" = '"' ]; then
        [ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true right after ws"; exit 13; }
        TAG_in_quoted_string=1
        return 1
      fi

      string_grammar || return 1
      if [ ${TAG_dote+x} ]; then
	TAG_grammar_mode=path
      fi
    ;;
    path)
      if [ "${TAG_dote+x}" ]; then
	log notice "suka"
      fi

      if [ "${TAG_in_ws_run+x}" ] && [ "$char" = '"' ] || [ "${TAG_dote+x}" ] && [ "$char" = '"' ]; then
        [ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true right after ws"; exit 13; }
        unset TAG_dote
        TAG_in_quoted_string=1
        return 1
      fi

      [ "${TAG_dote+x}" ] && unset TAG_dote


      string_grammar || return 1
    ;;
    string)
      if [ "${TAG_in_ws_run+x}" ] && [ "$char" = '"' ]; then
        [ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true right after ws"; exit 13; }
        TAG_in_quoted_string=1
        return 1
      fi

      string_grammar || return 1
      if [ ${TAG_dote+x} ]; then
        log error ". not allowed, use quote to escape it; on $WHITE$LINE_N$NC:$WHITE$CHAR_N$NC"
      fi
    ;;
    kw_in)
    ;;
    *) log panic 'unexpected TAG_grammar_mode'; exit 13; ;;
  esac
  write_char    "$char"

  return 1
}

finalize_first_arg() {
  case "$(cat "$CURRENT_STAGE_BUFFER")" in
    for)
      TAG_type='for'
      # NOTE: we know that next argument after `for` is string
      TAG_grammar_mode=string
      log error 'for unimplemented'
      exit 13
    ;;
    done)
      TAG_type='done'
      TAG_next_argument_redgect=1
      # NOTE: Do not save {[ done ]} to the AST becouse it is useless there
    ;;
    '{[')
      TAG_type='actual bracket'
      TAG_next_argument_redgect=1
      if yq -e "${AST_key}[-1].type == \"text\"" "$AST" > /dev/null; then
        yq -o j -i "${AST_key}[-1].value += \"{[\"" "$AST"
      else
        yq -o j -i "$AST_key += [{
          \"type\": \"text\",
          \"value\": \"{[\"
        }]" "$AST"
      fi
    ;;
    *)         # interpolation tag
      TAG_type='interpolation'
      TAG_next_argument_redgect=1
      buf=$(cat "$STAGE_BUFFER_1")
      yq -o j -i "$AST_key += [{
        \"type\": \"interpolation\",
        \"path\": \"$(json_escape "$buf")\"
      }]" "$AST"
    ;;
  esac
}

# finds open pattern and stores the char to the STAGE_BUFFER_1
find_open_pattern() {
  local char="${1:?}"
  if   [ ! "${open_tag_flag+x}" ] && [ "$char" = '{' ]; then
    open_tag_flag=1
  elif [ "${open_tag_flag+x}" ]; then
    unset open_tag_flag
    if [ "$char" = '[' ]; then
      return 0
    else
      printf '{%s' "$char" >> "$CURRENT_STAGE_BUFFER"
    fi
  else
    printf '%s' "$char" >> "$CURRENT_STAGE_BUFFER"
  fi

  return 1
}

parse() {
  char="$1"

  case "$STAGE" in
    # Text Stage - save char in STAGE_BUFFER_1 until next tag opens
    0)
      if find_open_pattern "$char"; then
        log debug "open pattern founded"
        buf=$(cat "$CURRENT_STAGE_BUFFER")
        yq -o j -i "$AST_key += [{
          \"type\": \"text\",
          \"value\": \"$(json_escape "$buf")\"
        }]" "$AST"

        buf_reset
        STAGE=1
      fi
    ;;
    1)
      if parse_tag "$char"; then
        log_buffers

        # zero-initialization

        buf_reset
        STAGE=1
      fi
    ;;
    2)

    ;;
    3)

    ;;
    4)

    ;;
    *)
      log error "error: ${WHITE}impossible stage"
      exit 13
    ;;
  esac
}

while [ $# -gt 0 ]; do
  case $1 in
    -c|--compact-output)
      OUTPUT_ARGS="${OUTPUT_ARGS+$OUTPUT_ARGS }-I=0"
      shift
    ;;
    --*|-*)
      log error "argument $1 does not exists"
      exit 9
    ;;
    *)
      log error "subcommand $1 does not exists"
      exit 9
    ;;
  esac
done

CHAR_N=1
LINE_N=1
#LINE_NUMBER=1
while :; do
    # read exactly 1 byte; preserve newlines
    if ! char="$(dd bs=1 count=1 2>/dev/null)"; then
        break
    fi

    # NOTE: if $char is empty, it because `dd` returned '\n' but `$(...)`
    # removed it as trailing '\n', so I set $char as '\n' here
    [ -z "$char" ] && {
        LINE_N=$((LINE_N+1))
        char='
'
    }

    parse "${char:?}"

    CHAR_N=$((CHAR_N+1))
done

# finish TEXT tag if file ends on it
if [ "$STAGE" -eq 0 ]; then
  if [ "${open_tag_flag+x}" ]; then
    unset open_tag_flag
    printf '{' >> "$STAGE_BUFFER_1"
  fi

  buf=$(cat "$STAGE_BUFFER_1")
  yq -o j -i "$AST_key += [{
    \"type\": \"text\",
    \"value\": \"$(json_escape "$buf")\"
  }]" "$AST"
fi

# return the output
# shellcheck disable=SC2086
yq ${OUTPUT_ARGS:-} -o j "$AST"