Files
util.nix/package/hemar/parser/hemar.sh

654 lines
16 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/dash
# segmented-path
# segment
# Syntax scheme:
#
# hemar
# elements
#
# elements
# element
# element elements
#
# element
# tag
# text
#
# text
# text-item
# text-item text
#
# text-item
# '0020' . '10FFFF' - '{'
# nopatern
#
# tag
# '{[' ws path ws ']}'
# '{[' ws for ws ']}'
# '{[' ws "done" ws ']}'
# '{[' ws '{[' ws ']}'
#
# # loop tag
# for
# "for" ws string ws "in" ws path
#
# # path
# path
# '.'
# segmented-path
#
# segmented-path
# segment
# segment '.' segmented-path
#
# segment
# string
# index
#
# index
# '[' digit ']'
# '[' onenine digits ']'
# '[' '-' onenine ']'
# '[' '-' onenine digits ']'
#
# # types
# string
# unquoted-string
# quoted-string
#
# unquoted-string
# unquoted-character
# unquoted-character unquoted-string
#
# unquoted-character
# '0020' . '10FFFF' - '"' - '\' - '.' - '[' - ']' - '{' - '}'
#
# quoted-string
# '"' quoted-characters '"'
#
# quoted-characters
# quoted-character
# quoted-character quoted-characters
#
# quoted-character
# '0000' . '10FFFF' - '"'
# '"' '"'
#
# digits
# digit
# digit digits
#
# digit
# '0'
# onenine
#
# onenine
# '1' . '9'
#
# # patterns
# ws
# ''
# '\x20' ws
# '\x0a' ws
# '\x0d' ws
# '\x09' ws
#
# nopatern
# '{' '0020' . '10FFFF' - '['
# AST Structure:
#
# The parser outputs a JSON array of elements directly (not wrapped in an object).
#
# Element types (currently implemented):
#
# Text = {
# "type": "text",
# "value": string # text content
# }
#
# Interpolation = {
# "type": "interpolation",
# "path": [PathSegment, ...] # structured path to variable in data model
# }
#
# PathSegment = {
# "type": "root" # root path: "."
# } | {
# "type": "key",
# "key": string # key name (can contain spaces if quoted)
# } | {
# "type": "index",
# "index": number # array index (can be negative)
# }
#
# Element types (planned for MVP):
#
# Section = {
# "type": "section",
# "variable": string # item variable name for loop
# "path": string # path to array for iteration
# "body": [Element] # section body (nested elements)
# }
#
# Element types (planned for future, not MVP):
#
# Include = {
# "type": "include",
# "path": string # path to template file to include
# }
#
# Compute = {
# "type": "compute",
# "language": string # programming language (dash, plpgsql, etc.)
# "body": string # function body
# }
#
# AbstractSyntaxTree = [Element, ...] # array of elements
# is_ws(char) -> bool
# Succeeds (status 0) when the first character of $1 is one of the template
# whitespace characters: space, newline, carriage return or tab.
# Returns 1 for anything else.
is_ws() {
  # Keep the scratch variable local so callers' globals are not clobbered.
  local ord
  # A leading single quote makes printf print the numeric code of the
  # character that follows it.
  ord=$(printf '%d' "'$1")
  case $ord in
    32|10|13|9) # <-> \x20 | \x0a | \x0d | \x09 <-> space | \n | \r | \t
      return 0
      ;;
  esac
  return 1
}
# log_buffers()
# Emits the current content of stage buffer 1 at debug level.
log_buffers() {
  local stage_one_content
  stage_one_content=$(cat "$STAGE_BUFFER_1")
  log debug "buff 1: ${WHITE}${stage_one_content}"
}
# remove_last_double_quote(text) -> text
# Prints `text` with the last double quote of each line removed.
# Lines without a double quote are printed unchanged.
remove_last_double_quote() {
  local text="$1"
  # The first group is greedy, so the quote matched between the two groups
  # is the last one on the line.
  printf '%s' "$text" \
    | sed -e 's/\(.*\)"\(.*\)/\1\2/'
}
# buf_read(buf?) -> text
# Prints the content of the buffer file `buf`.
# When no argument is passed at all, $CURRENT_STAGE_BUFFER is read instead
# (an explicitly passed empty string is NOT replaced by the default).
buf_read() {
  cat "${1-${CURRENT_STAGE_BUFFER}}"
}
# buf_reset()
# Makes stage buffer 1 the current stage buffer and truncates it to zero
# length.
buf_reset() {
  CURRENT_STAGE_BUFFER="$STAGE_BUFFER_1"
  : > "$CURRENT_STAGE_BUFFER"
}
# json_escape(value) -> str
# Prints `value` escaped for use inside a JSON string literal:
#   "  \  newline  CR  tab  backspace  form-feed  ->  \" \\ \n \r \t \b \f
#   any other character with code below 32       ->  \uXXXX
#   everything else                                ->  unchanged
json_escape() {
  local rest="${1}"
  local escaped=""
  local head code piece
  while [ -n "$rest" ]; do
    # Split off the first character: strip "everything but the first char"
    # from the end of the string.
    head="${rest%"${rest#?}"}"
    rest="${rest#?}"
    # Numeric code of the character (leading-quote form of printf %d).
    code=$(printf '%d' "'$head")
    case "$code" in
      34) piece='\"' ;;
      92) piece='\\' ;;
      10) piece='\n' ;;
      13) piece='\r' ;;
      9)  piece='\t' ;;
      8)  piece='\b' ;;
      12) piece='\f' ;;
      *)
        # Remaining control characters have no short form in JSON.
        if [ "$code" -lt 32 ]; then
          piece="\\u$(printf '%04x' "$code")"
        else
          piece="$head"
        fi
        ;;
    esac
    escaped="${escaped}${piece}"
  done
  printf '%s' "$escaped"
}
# parse_tag(char) -> status
# Consumes one character of a tag body (the input that follows `{[`) and
# appends accepted characters to $CURRENT_STAGE_BUFFER.
#
# Returns 0 only for the `}` that completes the closing `]}` (after calling
# `finish`). Returns 1 for every other character, whether the character was
# written to the buffer or skipped.
# Exits the shell with status 1 on a grammar error and with status 13 on an
# internal inconsistency ("panic").
#
# State is kept in global TAG_* variables which are tested for set/unset:
#   TAG_in_ws_run             - set when a whitespace character is seen,
#                               cleared by write_char on the next written char
#   TAG_seen_first_ws         - set by write_char the first time a character is
#                               written right after a whitespace run
#   TAG_pending_close         - we saw `]` and are checking if the next char is `}`
#   TAG_in_quoted_string      - we are between an opening `"` and its closing `"`
#   TAG_end_quote_pending     - a `"` was seen inside a quoted string; the next
#                               char decides between closing quote and `""`
#   TAG_dote                  - a `.` separator was just seen
#   TAG_grammar_mode          - unknown (default) | path | string | kw_in
#   TAG_type                  - tag kind; assigned by finalize_first_arg and
#                               the finish_* helpers
#   TAG_next_argument_redgect - when set, writing one more character is an error
parse_tag() {
local char="${1:?}"
# string_grammar() -> status
# Validates $char (read from parse_tag's local variable) against the string
# rules. Returns 0 when the caller should go on and write the character,
# 1 when the character was fully handled here and must not be written.
string_grammar() {
if [ "${TAG_in_quoted_string+x}" ]; then
if [ "${TAG_end_quote_pending+x}" ]; then
# The previous character was a `"` inside the quoted string.
case "$char" in
'"')
# Escaped quote: "" -> write single quote and continue in quoted string
unset TAG_end_quote_pending
printf '"' >> "$CURRENT_STAGE_BUFFER"
return 1
;;
'.')
# Closing quote followed by dot: exit quoted string, allow dot
unset TAG_end_quote_pending
unset TAG_in_quoted_string
TAG_dote=1
return 1
;;
']'|'}')
# Closing quote followed by bracket: exit quoted string, let bracket be handled
# NOTE(review): parse_tag clears both flags for `]` / `}` before
# string_grammar is called, so this arm looks unreachable - confirm.
unset TAG_end_quote_pending
unset TAG_in_quoted_string
return 1
;;
*)
if is_ws "$char"; then
# Closing quote followed by whitespace: exit quoted string
unset TAG_end_quote_pending
unset TAG_in_quoted_string
TAG_in_ws_run=1
return 1
else
log error "unexpected character $WHITE$char$NC after closing quote on $WHITE$LINE_N$NC:$WHITE$CHAR_N"
log error "expected: whitespace, dot, or end of tag"
exit 1
fi
;;
esac
elif [ "$char" = '"' ]; then
# We see a quote inside quoted string: might be closing or escaped
TAG_end_quote_pending=1
return 1
else
# Inside quoted string, all other chars are allowed (will be written by write_char)
return 0
fi
else
# Not in quoted string: validate unquoted characters
# shellcheck disable=SC1003
case "$char" in
']'|'}'|'"'|'\')
log error "not allowed character $WHITE$char$NC on $WHITE$LINE_N$NC:$WHITE$CHAR_N"
log error "try to use quoted string"
exit 1
;;
'.')
# Unquoted dot is a separator: remember it, do not write it.
TAG_dote=1
return 1
;;
esac
return 0
fi
}
# write_char(char)
# Appends `char` to $CURRENT_STAGE_BUFFER. Before writing it:
#   - fails (exit 1) if the tag type already rejected further arguments;
#   - if a whitespace run just ended: the first time only records that fact;
#     on later runs it finalizes the previous argument according to TAG_type
#     and resets the buffer so the new argument starts empty.
write_char() {
[ ${TAG_next_argument_redgect+x} ] && {
log error "too many argument for tag type $WHITE${TAG_type:?}$NC on $WHITE$LINE_N$NC:$WHITE$CHAR_N$NC";
exit 1;
}
[ ${TAG_in_ws_run+x} ] && {
unset TAG_in_ws_run
if [ "${TAG_seen_first_ws+x}" ]; then
log trace "tag in ws -> type: \`${TAG_type:-}\`"
case "${TAG_type:-unknown}" in
unknown) finalize_first_arg ;;
for)
# NOTE:
# grammar: for i in key."subkey" ; so we know
# 1st argument after `for` - string (name of variable)
# 2nd - 'in' (just keyword)
# 3rd - path (path to array in Model)
# All three arms are empty placeholders: no action is taken yet.
case ${TAG_grammar_mode:-1} in
string)
;;
kw_in)
;;
path)
;;
esac
;;
*) log panic 'unexpected TAG_type'; exit 13; ;;
esac
# NOTE: prepare to next argument
buf_reset
else
TAG_seen_first_ws=1
fi
}
printf '%s' "$1" >> "$CURRENT_STAGE_BUFFER"
}
# Check if we need to exit quoted string due to pending quote followed by bracket
if [ "${TAG_in_quoted_string+x}" ] && [ "${TAG_end_quote_pending+x}" ]; then
if [ "$char" = ']' ] || [ "$char" = '}' ]; then
# Closing quote followed by bracket: exit quoted string, handle bracket
unset TAG_end_quote_pending
unset TAG_in_quoted_string
# Fall through to bracket handling
fi
fi
# Close-pattern and whitespace detection only apply outside quoted strings.
if ! [ "${TAG_in_quoted_string+x}" ]; then
if [ ! "${TAG_pending_close+x}" ] && [ "$char" = ']' ]; then
TAG_pending_close=1
# NOTE: skip ']' but remember to check next char for a possible '}'
return 1
elif [ "${TAG_pending_close+x}" ]; then
unset TAG_pending_close
if [ "$char" = '}' ]; then
finish
# NOTE: found `]}` — finish bracket parsing
return 0
else
# NOTE: `]` was not followed by `}`, so emit the `]` we skipped
printf ']' >> "$CURRENT_STAGE_BUFFER"
fi
else
# Whitespace is never written; it only marks the start of a run.
is_ws "$char" && { TAG_in_ws_run=1; return 1; }
fi
fi
# Per-argument grammar: decide whether the character may be written.
case "${TAG_grammar_mode:-unknown}" in
unknown)
# NOTE: we always know grammar mode but first argument
# just regular parse as string or as path if seen unquoted '.'
# NOTE: this is after char's checked on ws
# Quote is allowed: at start of tag (empty buffer), after whitespace, or after dot
[ "${TAG_dote+x}" ] && { log panic "TAG_dote true in unknown TAG_grammar_mode"; exit 13; }
if [ "$char" = '"' ]; then
# Check if we're at start (empty buffer), after whitespace, or at tag start
if [ ! -s "$CURRENT_STAGE_BUFFER" ] || [ "${TAG_in_ws_run+x}" ]; then
[ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true"; exit 13; }
TAG_in_quoted_string=1
return 1
fi
# Quote in middle of unquoted string - let string_grammar reject it
fi
# NOTE(review): string_grammar returns 1 whenever it sets TAG_dote, so the
# `|| return 1` leaves before the switch to path mode below can run; the
# next non-whitespace character then appears to hit the panic guard at the
# top of this arm - confirm.
string_grammar || return 1
if [ ${TAG_dote+x} ]; then
TAG_grammar_mode=path
fi
;;
path)
# Quote is allowed: after whitespace or after dot
if [ "$char" = '"' ]; then
if [ "${TAG_in_ws_run+x}" ] || [ "${TAG_dote+x}" ] || [ ! -s "$CURRENT_STAGE_BUFFER" ]; then
[ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true"; exit 13; }
[ "${TAG_dote+x}" ] && unset TAG_dote
TAG_in_quoted_string=1
return 1
fi
# Quote in middle of unquoted string - let string_grammar reject it
fi
# The separator has been consumed; the next segment starts here.
[ "${TAG_dote+x}" ] && unset TAG_dote
string_grammar || return 1
;;
string)
# Quote is allowed: after whitespace or at segment start
if [ "$char" = '"' ]; then
if [ "${TAG_in_ws_run+x}" ] || [ ! -s "$CURRENT_STAGE_BUFFER" ]; then
[ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true"; exit 13; }
TAG_in_quoted_string=1
return 1
fi
# Quote in middle of unquoted string - let string_grammar reject it
fi
string_grammar || return 1
# NOTE(review): the error below is only logged; there is no exit after it.
if [ ${TAG_dote+x} ]; then
log error ". not allowed, use quote to escape it; on $WHITE$LINE_N$NC:$WHITE$CHAR_N$NC"
fi
;;
kw_in)
# Placeholder: the `in` keyword is not validated yet.
;;
*) log panic 'unexpected TAG_grammar_mode'; exit 13; ;;
esac
# The character passed every check: store it and ask for the next one.
write_char "$char"
return 1
}
# finish()
# Called when the closing `]}` of a tag has been read; emits whatever the
# tag stands for.
#
# When TAG_type is still unset the tag was closed right after its first
# argument, so that argument has not been classified yet. It is handed to
# finalize_first_arg, which tells `for`, `done` and the `{[` escape apart
# from a plain interpolation. Sending it straight to finish_interpolation_tag
# would turn `{[ done ]}` and `{[ {[ ]}` into interpolation elements.
#
# Exits with 13 on a TAG_type it does not know.
finish() {
  case "${TAG_type:-unknown}" in
    unknown)
      finalize_first_arg
      ;;
    done)
      finish_done_tag
      ;;
    '{[')
      finish_bracket_tag
      ;;
    for)
      # Nothing to emit for a loop tag yet.
      ;;
    *)
      log panic 'unexpected TAG_type on finish'
      exit 13
      ;;
  esac
}
# finalize_first_arg()
# Classifies the tag by the content of $CURRENT_STAGE_BUFFER (its first
# argument) and dispatches to the matching handler:
#   for   -> marks the tag as a loop, then aborts with 13 (not implemented)
#   done  -> finish_done_tag
#   {[    -> finish_bracket_tag
#   other -> finish_interpolation_tag
finalize_first_arg() {
  local first_arg
  log trace "finalize first arg"
  first_arg=$(cat "$CURRENT_STAGE_BUFFER")
  log trace "buffer: $first_arg"
  case "$first_arg" in
    'for')
      # NOTE: the argument that follows `for` is always a string
      TAG_type='for'
      TAG_grammar_mode=string
      log error 'for unimplemented'
      exit 13
      ;;
    'done') finish_done_tag ;;
    '{[')   finish_bracket_tag ;;
    *)      finish_interpolation_tag ;; # anything else is an interpolation
  esac
}
# finish_done_tag()
# Marks the current tag as `done` and forbids any further argument.
# NOTE: `{[ done ]}` is not stored in the AST because it is useless there.
finish_done_tag() {
  TAG_next_argument_redgect=1
  TAG_type='done'
}
# finish_bracket_tag()
# Handles the escape tag that stands for a literal `{[`.
# Marks the tag type, forbids further arguments, then stores the two
# characters in the AST file: appended to the last element when that element
# is text, otherwise as a new text element.
finish_bracket_tag() {
  local last_is_text
  TAG_next_argument_redgect=1
  TAG_type='actual bracket'
  last_is_text="${AST_key}[-1].type == \"text\""
  if ! yq -e "$last_is_text" "$AST" > /dev/null; then
    # No text element to extend: add a fresh one.
    yq -o j -i "$AST_key += [{
\"type\": \"text\",
\"value\": \"{[\"
}]" "$AST"
    return
  fi
  yq -o j -i "${AST_key}[-1].value += \"{[\"" "$AST"
}
# finish_interpolation_tag()
# Marks the current tag as an interpolation, forbids further arguments and
# appends an interpolation element to the AST file. The element's "path" is
# the JSON-escaped content of stage buffer 1.
finish_interpolation_tag() {
  # `buf` is also used as a global by other parts of the parser; keep this
  # copy local so it is not overwritten from here.
  local buf
  log trace 'finish interpolation tag'
  TAG_type='interpolation'
  TAG_next_argument_redgect=1
  buf=$(cat "$STAGE_BUFFER_1")
  yq -o j -i "$AST_key += [{
\"type\": \"interpolation\",
\"path\": \"$(json_escape "$buf")\"
}]" "$AST"
}
# find_open_pattern(char) -> status
# Feeds one text-stage character to the `{[` detector.
# Returns 0 when `char` is the `[` that completes an opening `{[`;
# returns 1 otherwise. Characters that turn out to be plain text are appended
# to $CURRENT_STAGE_BUFFER. A `{` is held back (open_tag_flag) until the next
# character shows whether it starts a tag.
find_open_pattern() {
  local ch="${1:?}"

  if [ "${open_tag_flag+x}" ]; then
    # The previous character was a held-back `{`.
    unset open_tag_flag
    if [ "$ch" = '[' ]; then
      return 0
    fi
    # Not a tag after all: emit the held `{` together with this character.
    printf '{%s' "$ch" >> "$CURRENT_STAGE_BUFFER"
    return 1
  fi

  if [ "$ch" = '{' ]; then
    open_tag_flag=1
    return 1
  fi

  printf '%s' "$ch" >> "$CURRENT_STAGE_BUFFER"
  return 1
}
# parse(char)
# Feeds one input character to the parser; what happens depends on $STAGE.
#   0 - text stage: characters are collected until `{[` opens a tag; the
#       collected text (if any) is then appended to the AST as a text
#       element and the parser switches to stage 1.
#   1 - tag stage: characters go to parse_tag; when it reports the end of
#       the tag, all per-tag state is cleared and the parser returns to
#       stage 0.
#   2, 3, 4 - accepted but do nothing.
# Any other stage value is an error and exits with 13.
# The global variables `char` and `buf` are overwritten.
parse() {
  char="$1"
  case "$STAGE" in
    0)
      # Keep collecting text until an opening pattern has been found.
      if ! find_open_pattern "$char"; then
        return 0
      fi
      log trace "open pattern founded"
      buf=$(cat "$CURRENT_STAGE_BUFFER")
      # NOTE: an empty buffer produces no text element
      [ -z "$buf" ] || yq -o j -i "$AST_key += [{
\"type\": \"text\",
\"value\": \"$(json_escape "$buf")\"
}]" "$AST"
      buf_reset
      STAGE=1
      ;;
    1)
      # Stay in the tag stage until parse_tag reports the closing pattern.
      if ! parse_tag "$char"; then
        return 0
      fi
      log_buffers
      # zero-initialization
      unset TAG_seen_first_ws TAG_in_ws_run TAG_pending_close TAG_type TAG_next_argument_redgect TAG_grammar_mode TAG_in_quoted_string TAG_dote
      buf_reset
      STAGE=0
      ;;
    2|3|4)
      ;;
    *)
      log error "error: ${WHITE}impossible stage"
      exit 13
      ;;
  esac
}
# Script entry point: runs only when the file is executed directly, i.e.
# when AS_LIBRARY is not set. Reads a template from stdin one byte at a
# time, feeds every character to `parse`, and prints the resulting AST as
# JSON on stdout.
#
# Options:
#   -c, --compact-output   pass -I=0 to the final yq call
# Any other argument is an error (exit 9).
if [ -z "${AS_LIBRARY+x}" ]; then
  log notice "running"

  # Both temp files are removed by ONE handler. Installing a second
  # `trap ... EXIT` would replace the first one instead of adding to it and
  # leave the AST file behind.
  AST=''
  STAGE_BUFFER_1=''
  hemar_cleanup() {
    # A variable is still empty if we exit before its file was created.
    [ -n "${AST:-}" ] && rm -f -- "$AST"
    [ -n "${STAGE_BUFFER_1:-}" ] && rm -f -- "$STAGE_BUFFER_1"
    return 0
  }
  trap hemar_cleanup EXIT
  # On a signal, stop the script; the EXIT trap then does the cleanup.
  # Without the explicit exit the script would keep running after its temp
  # files were deleted.
  trap 'exit 130' INT
  trap 'exit 129' HUP

  AST=$(mktemp)
  yq -o j -i "[]" "$AST"
  AST_key='.'
  yq -o j -i '.' "$AST"
  log debug "AST path: ${WHITE}${AST}"

  # Parser stages:
  # 0 - text
  # 1 - decide tag type
  # 2 - interpolation
  # 3 - section
  # 4 - include
  # 5 - compute
  STAGE=0
  STAGE_BUFFER_1="$(mktemp)"
  CURRENT_STAGE_BUFFER=$STAGE_BUFFER_1
  log debug "stage buffer 1: ${WHITE}$STAGE_BUFFER_1"

  while [ $# -gt 0 ]; do
    case $1 in
      -c|--compact-output)
        OUTPUT_ARGS="${OUTPUT_ARGS+$OUTPUT_ARGS }-I=0"
        shift
        ;;
      --*|-*)
        log error "argument $1 does not exists"
        exit 9
        ;;
      *)
        log error "subcommand $1 does not exists"
        exit 9
        ;;
    esac
  done

  # CHAR_N counts characters from the start of the input (it is not reset
  # on a new line); LINE_N counts lines. Both are used in error messages.
  CHAR_N=1
  LINE_N=1
  while :; do
    # Read exactly one byte and get its decimal value; empty means EOF.
    hex="$(dd bs=1 count=1 2>/dev/null | od -An -t u1)"
    [ -z "$hex" ] && {
      break
    }
    # Turn the decimal value back into the character via an octal escape.
    # shellcheck disable=SC2059
    char="$(printf "\\$(printf '%03o' "$hex")")"
    # NOTE: if $char is empty, it is because `dd` returned '\n' but `$(...)`
    # removed it as a trailing '\n', so $char is set to '\n' here
    [ -z "$char" ] && {
      LINE_N=$((LINE_N+1))
      char='
'
    }
    log trace "char: $WHITE$char"
    parse "${char:?}"
    CHAR_N=$((CHAR_N+1))
  done

  log debug 'finishing'
  # finish TEXT tag if file ends on it
  if [ "$STAGE" -eq 0 ]; then
    # A trailing `{` was held back by find_open_pattern; it is plain text.
    if [ "${open_tag_flag+x}" ]; then
      unset open_tag_flag
      printf '{' >> "$STAGE_BUFFER_1"
    fi
    buf=$(cat "$STAGE_BUFFER_1")
    # Only add text element if buffer is not empty
    if [ -n "$buf" ]; then
      yq -o j -i "$AST_key += [{
\"type\": \"text\",
\"value\": \"$(json_escape "$buf")\"
}]" "$AST"
    fi
  fi

  # return the output
  # shellcheck disable=SC2086
  yq ${OUTPUT_ARGS:-} -o j "$AST"
fi