Files
util.nix/package/hemar/parser/hemar.sh

510 lines
11 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/bin/dash
# Template parser: reads a template from stdin one byte at a time, builds an
# element list in a temporary file through `yq`, and prints it as JSON.
# NOTE(review): `log` and the colour variables ($WHITE, $NC) are not defined
# in this file - presumably provided by the calling environment; confirm.
log notice "running"
# segmented-path
# segment
# Syntax scheme:
#
# hemar
# elements
#
# elements
# element
# element elements
#
# element
# tag
# text
#
# text
# text-item
# text-item text
#
# text-item
# '0020' . '10FFFF' - '{'
# nopatern
#
# tag
# '{[' ws path ws ']}'
# '{[' ws for ws ']}'
# '{[' ws "done" ws ']}'
# '{[' ws '{[' ws ']}'
#
# # loop tag
# for
# "for" ws string ws "in" ws path
#
# # path
# path
# '.'
# segmented-path
#
# segmented-path
# segment
# segment '.' segmented-path
#
# segment
# string
# index
#
# index
# '[' digit ']'
# '[' onenine digits ']'
# '[' '-' onenine ']'
# '[' '-' onenine digits ']'
#
# # types
# string
# unquoted-string
# quoted-string
#
# unquoted-string
# unquoted-character
# unquoted-character unquoted-string
#
# unquoted-character
# '0020' . '10FFFF' - '"' - '\' - '.' - '[' - ']' - '{' - '}'
#
# quoted-string
# '"' quoted-characters '"'
#
# quoted-characters
# quoted-character
# quoted-character quoted-characters
#
# quoted-character
# '0000' . '10FFFF' - '"'
# '"' '"'
#
# digits
# digit
# digit digits
#
# digit
# '0'
# onenine
#
# onenine
# '1' . '9'
#
# # patterns
# ws
# ''
# '\x20' ws
# '\x0a' ws
# '\x0d' ws
# '\x09' ws
#
# nopatern
# '{' '0020' . '10FFFF' - '['
# AST Plex:
#
# Type = 0..=5
#
# Text = string # just a text body
#
# Interpolation = string # path to variable
#
# Section = {
# v = string # item variable name for loop
# p = string # path to array for iteration
# b = [Element] # section body
#
# }
#
# End = null
#
# Compute = {
# l = string # programming language
# b = string # function body
# }
#
# Element = {
# t = Type # element type
# b = Text # element body
# | Interpolation
# | Section
# | End
# | Include
# | Compute
# }
#
# AbstractSyntaxTree (AST) = {
# e = [Element] # elements array
# }
# Temporary file that holds the AST while it is being built; the `yq -i`
# calls in this file edit it in place.
AST=$(mktemp)
# yq path that new elements are appended to with `+=`.
AST_key='.'
# NOTE(review): a later `trap ... EXIT INT HUP` in this file replaces this
# handler, so removal of "$AST" depends on what that later trap does.
trap 'rm -f "$AST"' EXIT INT HUP
# Run the freshly created file through yq in place with JSON output before
# anything is appended to it.
yq -o j -i '.' "$AST"
log debug "AST path: ${WHITE}${AST}"
# Parser stage, dispatched on by parse():
# 0 - text
# 1 - decide tag type
# 2 - interpolation
# 3 - section
# 4 - include
# 5 - compute
# NOTE(review): parse() has arms for 0-4 only, so 5 lands in its catch-all
# error arm; of the arms present, only 0 and 1 contain any commands.
STAGE=0
# is_ws(char) -> bool
# Succeeds (status 0) when $1 starts with one of the grammar's `ws`
# characters - space, line feed, carriage return or tab - and fails
# (status 1) otherwise.
is_ws() {
  # Keep the code point local so the helper does not clobber a global `ord`.
  local ord
  # A leading quote makes printf print the numeric value of the character.
  ord=$(printf '%d' "'$1")
  case $ord in
    32 | 10 | 13 | 9) # \x20 | \x0a | \x0d | \x09 <-> space | \n | \r | \t
      return 0
      ;;
  esac
  return 1
}
# log_buffers() -> void
# Writes the current contents of stage buffer 1 to the debug log.
log_buffers() {
  local contents
  contents=$(cat "$STAGE_BUFFER_1")
  log debug "buff 1: ${WHITE}${contents}"
}
# remove_last_double_quote(text) -> text
# Prints $1 with the last double quote on each line removed; a line without
# a double quote is printed unchanged.
remove_last_double_quote() {
  local input
  input=$1
  # The greedy `.*` runs up to the final quote of the line, which is dropped.
  printf '%s' "$input" | sed -e 's/\(.*\)"/\1/'
}
# buf_read(buf?) -> text
# Prints the contents of the buffer file given as $1; when called without
# arguments, prints the file named by $CURRENT_STAGE_BUFFER instead.
buf_read() {
  local buf
  buf=$CURRENT_STAGE_BUFFER
  # An explicitly passed argument (even an empty one) overrides the default.
  if [ "$#" -gt 0 ]; then
    buf=$1
  fi
  cat "$buf"
}
# buf_reset() -> void
# Empties stage buffer 1 and makes it the current stage buffer again.
buf_reset() {
  # truncate the file in place, keeping the same path
  : >"$STAGE_BUFFER_1"
  CURRENT_STAGE_BUFFER=$STAGE_BUFFER_1
}
# Scratch file that accumulates the characters of the element being parsed.
STAGE_BUFFER_1="$(mktemp)"
CURRENT_STAGE_BUFFER=$STAGE_BUFFER_1
# Setting a trap replaces whatever handler was installed for the same
# condition before, so this handler removes the AST file as well as the
# stage buffer; otherwise "$AST" would be left behind.
trap 'rm -f "$AST" "$STAGE_BUFFER_1"' EXIT
# Leave on INT/HUP instead of carrying on with deleted temp files; the EXIT
# trap above still performs the cleanup.
trap 'exit 130' INT
trap 'exit 129' HUP
log debug "stage buffer 1: ${WHITE}$STAGE_BUFFER_1"
# json_escape(value) -> str
# Prints $1 escaped for use inside a double-quoted JSON string: backslash,
# double quote, line feed, carriage return and tab are replaced by their
# two-character escapes (\\ \" \n \r \t). Other control characters are
# passed through unchanged.
# NOTE(review): callers splice the result into yq expression string
# literals - confirm the installed yq decodes these escapes there.
json_escape() {
  local tab cr
  tab=$(printf '\t')
  cr=$(printf '\r')
  # A line feed is appended so that every line feed of the value ends up
  # inside sed's pattern space (including trailing ones); the single real
  # line feed sed prints at the end is removed again by tr.
  # The `:a ... ba` loop joins all input lines before substituting, and the
  # backslash rule runs first so later rules do not double-escape.
  printf '%s\n' "${1}" |
    LC_ALL=C sed \
      -e ':a' -e '$!{' -e 'N' -e 'ba' -e '}' \
      -e 's/\\/\\\\/g' \
      -e 's/"/\\"/g' \
      -e "s/${tab}/\\\\t/g" \
      -e "s/${cr}/\\\\r/g" \
      -e 's/\n/\\n/g' |
    tr -d '\n'
}
# parse_tag(char) -> bool
# Consumes one character of a tag body (what follows `{[`). Returns 0 only
# when a `}` arrives directly after a pending `]`, i.e. the closing `]}` is
# complete; every other path returns 1 or exits. Characters that belong to
# an argument are appended to $CURRENT_STAGE_BUFFER through write_char.
# State is carried between calls in global TAG_* variables.
# NOTE(review): nothing in this file unsets TAG_in_quoted_string, TAG_type,
# TAG_grammar_mode, TAG_seen_first_ws or TAG_next_argument_redgect once they
# are set - confirm whether they are meant to be reset when a tag closes.
parse_tag() {
local char="${1:?}"
# NOTE: any return 1 - skip char, regular_char + return 1 - write char
# TAG_seen_first_ws - we've already handled the first whitespace after `{[...]`
# TAG_in_ws_run - we're currently in a run of whitespace chars
# TAG_pending_close - we saw `]` and are checking if the next char is `}`
# string_grammar() -> bool
# Applies the string rules to $char (the local of the running parse_tag
# call). Returns 1 when the character must not reach write_char.
# The definition is re-executed on every parse_tag call; shell functions are
# global, so it is not private to parse_tag.
string_grammar() {
if [ "${TAG_in_quoted_string+x}" ]; then
if [ "${TAG_end_quote_pending+x}" ]; then
# The previous character was a `"` seen inside a quoted string.
case "$char" in
'"')
# `""`: clear the pending flag; no `return 1` follows, so the caller goes on
# to write this `"`.
unset TAG_end_quote_pending
;;
'.')
# `".`: flag the path separator and skip the dot itself.
# TAG_end_quote_pending is left set on this path.
TAG_dote=1
return 1
;;
# Any other character: only logged; the function then ends with the status
# of the `log` call.
*) log error "unexpected end of quote on $WHITE$LINE_N$NC:$WHITE$CHAR_N" ;;
esac
elif [ "$char" = '"' ]; then
# A `"` inside the quoted string: skip it and let the next character decide.
TAG_end_quote_pending=1
return 1
fi
else
# Unquoted text: the characters below are reported but not rejected with an
# explicit status; `.` flags the path separator and is skipped.
# shellcheck disable=SC1003
case "$char" in
'['|']'|'{'|'}'|'"'|'\')
log error "not allowed character $WHITE$char$NC on $WHITE$LINE_N$NC:$WHITE$CHAR_N"
log error "try to use quoted string"
;;
'.')
TAG_dote=1
return 1
;;
esac
fi
}
# write_char(char) -> void
# Appends $1 to the current stage buffer. When the character is the first
# one after a whitespace run it first closes the previous argument.
write_char() {
# An earlier argument already completed a tag type that takes no more
# arguments: abort.
[ ${TAG_next_argument_redgect+x} ] && {
log error "too many argument for tag type $WHITE${TAG_type:?}$NC on $WHITE$LINE_N$NC:$WHITE$CHAR_N$NC";
exit 1;
}
[ ${TAG_in_ws_run+x} ] && {
unset TAG_in_ws_run
if [ "${TAG_seen_first_ws+x}" ]; then
# Whitespace between two arguments: the buffer holds the previous argument.
case "${TAG_type:-unknown}" in
unknown) finalize_first_arg ;;
for)
# NOTE:
# grammar: for i in key."subkey" ; so we know
# 1st argument after `for` - string (name of variable)
# 2nd - 'in' (just keyword)
# 3rd - path (path to array in Model)
# All three arms are empty; the default word `1` matches none of them.
case ${TAG_grammar_mode:-1} in
string)
;;
kw_in)
;;
path)
;;
esac
;;
*) log panic 'unexpected TAG_type'; exit 13; ;;
esac
# NOTE: prepare to next argument
buf_reset
else
# First whitespace run of the tag (the one before the first argument).
TAG_seen_first_ws=1
fi
}
printf '%s' "$1" >> "$CURRENT_STAGE_BUFFER"
}
if [ ! "${TAG_pending_close+x}" ] && [ "$char" = ']' ]; then
TAG_pending_close=1
# NOTE: skip ']' but remember to check next char for a possible '}'
return 1
elif [ "${TAG_pending_close+x}" ]; then
unset TAG_pending_close
if [ "$char" = '}' ]; then
# NOTE: found `]}` — finish bracket parsing
return 0
else
# NOTE: `]` was not followed by `}`, so emit the `]` we skipped
# The current character skips the whitespace check below and goes straight
# to the grammar dispatch.
printf ']' >> "$CURRENT_STAGE_BUFFER"
fi
else
# Whitespace is never written; it only marks that a run is in progress.
is_ws "$char" && { TAG_in_ws_run=1; return 1; }
fi
case "${TAG_grammar_mode:-unknown}" in
unknown)
# NOTE: we always know grammar mode but first argument
# just regular parse as string or as path if seen unquoted '.'
# NOTE: this is after char's checked on ws
# so if TAG_in_ws_run exists then this is first char in argument (just after ws)
[ "${TAG_dote+x}" ] && { log panic "TAG_dote true in unknown TAG_grammar_mode"; exit 13; }
# A `"` as the first character of an argument opens a quoted string; the
# quote itself is not written.
if [ "${TAG_in_ws_run+x}" ] && [ "$char" = '"' ]; then
[ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true right after ws"; exit 13; }
TAG_in_quoted_string=1
return 1
fi
string_grammar || return 1
# NOTE(review): string_grammar returns 1 right after it sets TAG_dote, so the
# `|| return 1` above returns before this check can see the flag. The switch
# to path mode looks unreachable, and the next character would then hit the
# panic at the top of this arm - confirm.
if [ ${TAG_dote+x} ]; then
TAG_grammar_mode=path
fi
;;
path)
if [ "${TAG_dote+x}" ]; then
log notice "suka"
fi
# `&&` and `||` have equal precedence and group left to right, so this reads
# ((in_ws_run && quote) || dote) && quote.
if [ "${TAG_in_ws_run+x}" ] && [ "$char" = '"' ] || [ "${TAG_dote+x}" ] && [ "$char" = '"' ]; then
[ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true right after ws"; exit 13; }
unset TAG_dote
TAG_in_quoted_string=1
return 1
fi
[ "${TAG_dote+x}" ] && unset TAG_dote
string_grammar || return 1
;;
string)
# Same quoted-string opening rule as in the `unknown` arm.
if [ "${TAG_in_ws_run+x}" ] && [ "$char" = '"' ]; then
[ "${TAG_in_quoted_string+x}" ] && { log panic "TAG_in_quoted_string already true right after ws"; exit 13; }
TAG_in_quoted_string=1
return 1
fi
string_grammar || return 1
# NOTE(review): as in the `unknown` arm, TAG_dote is only set on paths where
# string_grammar returns 1, so this check may never fire for the dot that was
# just read - confirm.
if [ ${TAG_dote+x} ]; then
log error ". not allowed, use quote to escape it; on $WHITE$LINE_N$NC:$WHITE$CHAR_N$NC"
fi
;;
kw_in)
;;
*) log panic 'unexpected TAG_grammar_mode'; exit 13; ;;
esac
write_char "$char"
return 1
}
# finalize_first_arg() -> void
# Classifies the tag from its first argument (the contents of the current
# stage buffer), records the result in TAG_type and, depending on the type,
# appends to the AST file:
#   for   - sets string grammar mode, then aborts with status 13
#   done  - nothing is stored
#   {[    - a literal "{[" is added as (or appended to) a text element
#   other - the argument is stored as an interpolation path
# Every type except `for` sets TAG_next_argument_redgect.
finalize_first_arg() {
  local first_arg
  first_arg=$(cat "$CURRENT_STAGE_BUFFER")

  if [ "$first_arg" = 'for' ]; then
    TAG_type='for'
    # the argument that follows `for` is a string
    TAG_grammar_mode=string
    log error 'for unimplemented'
    exit 13
  elif [ "$first_arg" = 'done' ]; then
    # `{[ done ]}` carries no information for the AST, so it is not saved
    TAG_type='done'
    TAG_next_argument_redgect=1
  elif [ "$first_arg" = '{[' ]; then
    TAG_type='actual bracket'
    TAG_next_argument_redgect=1
    # Extend the last element when it is text, otherwise add a new text
    # element. The unindented lines below are part of the yq expression.
    if yq -e "${AST_key}[-1].type == \"text\"" "$AST" > /dev/null; then
      yq -o j -i "${AST_key}[-1].value += \"{[\"" "$AST"
    else
      yq -o j -i "$AST_key += [{
\"type\": \"text\",
\"value\": \"{[\"
}]" "$AST"
    fi
  else
    # anything else is an interpolation tag
    TAG_type='interpolation'
    TAG_next_argument_redgect=1
    buf=$(cat "$STAGE_BUFFER_1")
    yq -o j -i "$AST_key += [{
\"type\": \"interpolation\",
\"path\": \"$(json_escape "$buf")\"
}]" "$AST"
  fi
}
# find_open_pattern(char) -> bool
# Looks for the tag opener `{[` in the character stream. Returns 0 when $1 is
# the `[` that completes it and 1 otherwise. Ordinary characters are appended
# to the current stage buffer; a `{` is held back (open_tag_flag) until the
# next character shows whether it starts a tag.
find_open_pattern() {
  local char="${1:?}"

  if [ "${open_tag_flag+x}" ]; then
    # The previous character was a held-back `{`.
    unset open_tag_flag
    [ "$char" = '[' ] && return 0
    # Not an opener after all: emit the held `{` together with this character.
    printf '{%s' "$char" >> "$CURRENT_STAGE_BUFFER"
    return 1
  fi

  if [ "$char" = '{' ]; then
    open_tag_flag=1
    return 1
  fi

  printf '%s' "$char" >> "$CURRENT_STAGE_BUFFER"
  return 1
}
# parse(char) -> void
# Feeds one character to the handler of the current $STAGE.
#   0     - text: collect characters until `{[` opens a tag, then store the
#           collected text as a text element and switch to stage 1
#   1     - tag: hand the character to parse_tag until it reports `]}`
#   2,3,4 - accepted, nothing is done
#   other - error, exit 13
parse() {
  char="$1"
  case "$STAGE" in
    0)
      find_open_pattern "$char" || return 0
      log debug "open pattern founded"
      buf=$(cat "$CURRENT_STAGE_BUFFER")
      # The unindented lines below are part of the yq expression.
      yq -o j -i "$AST_key += [{
\"type\": \"text\",
\"value\": \"$(json_escape "$buf")\"
}]" "$AST"
      buf_reset
      STAGE=1
      ;;
    1)
      parse_tag "$char" || return 0
      log_buffers
      # start the next element with an empty buffer
      buf_reset
      # NOTE(review): the stage stays 1 after the tag has closed, so the text
      # that follows is still handed to parse_tag - confirm this is intended.
      STAGE=1
      ;;
    2 | 3 | 4) ;;
    *)
      log error "error: ${WHITE}impossible stage"
      exit 13
      ;;
  esac
}
# Command-line handling: -c/--compact-output adds `-I=0` to the options of
# the final yq call; any other argument is rejected with exit status 9.
while [ "$#" -gt 0 ]; do
  case "$1" in
    -c | --compact-output)
      OUTPUT_ARGS="${OUTPUT_ARGS+$OUTPUT_ARGS }-I=0"
      shift
      ;;
    -*)
      # covers both short (-x) and long (--xyz) unknown options
      log error "argument $1 does not exists"
      exit 9
      ;;
    *)
      log error "subcommand $1 does not exists"
      exit 9
      ;;
  esac
done
CHAR_N=1
LINE_N=1
#LINE_NUMBER=1
# Feed stdin to parse() one byte at a time until the input is exhausted.
while :; do
  # `dd` exits 0 at end of input too, so its status cannot signal EOF; relying
  # on it would loop forever on an empty value. The sentinel printed after dd
  # keeps $(...) from stripping a line feed and, once removed again, leaves an
  # empty value exactly when no byte was read.
  # NOTE(review): a NUL byte cannot be held in a shell variable, so it would
  # presumably be seen as end of input as well - confirm if that matters.
  char=$(dd bs=1 count=1 2>/dev/null; printf 'x')
  char=${char%x}
  # nothing was read: end of input (or dd could not read) - stop
  [ -n "$char" ] || break
  case $char in
    '
')
      LINE_N=$((LINE_N + 1))
      ;;
  esac
  parse "$char"
  CHAR_N=$((CHAR_N + 1))
done
# Input ended while in the text stage: store what is left in the buffer as a
# final text element.
if [ "$STAGE" -eq 0 ]; then
  # A `{` that was held back waiting for `[` turned out to be plain text.
  if [ -n "${open_tag_flag+x}" ]; then
    unset open_tag_flag
    printf '%s' '{' >> "$STAGE_BUFFER_1"
  fi
  buf=$(cat "$STAGE_BUFFER_1")
  # The unindented lines below are part of the yq expression.
  yq -o j -i "$AST_key += [{
\"type\": \"text\",
\"value\": \"$(json_escape "$buf")\"
}]" "$AST"
fi

# Print the finished AST as JSON; OUTPUT_ARGS is left unquoted on purpose so
# that it splits into separate options.
# shellcheck disable=SC2086
yq ${OUTPUT_ARGS:-} -o j "$AST"