1#include <limits.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5
6#include <myaw.h>
7#include <pwlib/parsers.h>
8
// Size argument used when mw_create_parser creates the parser's current_line string.
#define DEFAULT_LINE_CAPACITY 250
10
/*
 * Tracing helpers.
 *
 * When TRACE_ENABLED is defined, the macros below write indented diagnostics to stderr.
 * Otherwise they expand to nothing.
 *
 * TRACE_ENTER and TRACEPOINT expand to code that reads `parser->line_number` and
 * `parser->block_indent`, so they can only be used where a variable named `parser`
 * is in scope.
 */
#ifdef TRACE_ENABLED
    // current nesting depth of traced scopes; raised by TRACE_ENTER, lowered by TRACE_EXIT
    static unsigned tracelevel = 0;

    // print four spaces per trace level
#   define _TRACE_INDENT() \
        for (unsigned i = 0; i < tracelevel * 4; i++) {  \
            fputc(' ', stderr);  \
        }

    // print indentation followed by function name, line number, and block indent (no line break)
#   define _TRACE_POS()  \
        _TRACE_INDENT() \
        fprintf(stderr, "%s; line %u, block indent %u", \
                __func__, parser->line_number, parser->block_indent);

    // print position followed by opening brace and increase trace level
#   define TRACE_ENTER() \
        do {  \
            _TRACE_POS()  \
            fputs(" {\n", stderr);  \
            tracelevel++;  \
        } while (false)

    // decrease trace level and print closing brace
#   define TRACE_EXIT() \
        do {  \
            tracelevel--;  \
            _TRACE_INDENT()  \
            fputs("}\n", stderr);  \
        } while (false)

    // print position on a line of its own
#   define TRACEPOINT()  \
        do {  \
            _TRACE_POS()  \
            fputc('\n', stderr);  \
        } while (false)

    // print indented printf-style message prefixed with function name
#   define TRACE(...)  \
        do {  \
            _TRACE_INDENT()  \
            fprintf(stderr, "%s: ", __func__);  \
            fprintf(stderr, __VA_ARGS__);  \
            fputc('\n', stderr);  \
        } while (false)
#else
    // tracing disabled: all trace macros expand to nothing
#   define TRACEPOINT()
#   define TRACE_ENTER()
#   define TRACE_EXIT()
#   define TRACE(...)
#endif
57
// forward declarations of parser functions that are referenced before they are defined
[[nodiscard]] static bool parse_value(MwParser* parser, unsigned* nested_value_pos, PwValuePtr convspec, PwValuePtr result);
[[nodiscard]] static bool value_parser_func(MwParser* parser, PwValuePtr result);
[[nodiscard]] static bool parse_raw_value(MwParser* parser, PwValuePtr result);
[[nodiscard]] static bool parse_literal_string(MwParser* parser, PwValuePtr result);
[[nodiscard]] static bool parse_folded_string(MwParser* parser, PwValuePtr result);
[[nodiscard]] static bool parse_datetime(MwParser* parser, PwValuePtr result);
[[nodiscard]] static bool parse_timestamp(MwParser* parser, PwValuePtr result);

// Zero-terminated list holding the comment character and colon.
// NOTE(review): presumably the terminators accepted after a number by _mw_parse_number;
// not referenced in the visible part of the file -- confirm against callers.
static char32_t number_terminators[] = { MW_COMMENT, ':', 0 };
68
69
70MwParser* mw_create_parser(PwValuePtr markup)
71{
72 MwParser* parser = allocate(sizeof(MwParser), true);
73 if (!parser) {
74 pw_set_status(PwStatus(PW_ERROR_OOM));
75 return nullptr;
76 }
77 parser->markup = pw_clone(markup);
78
79 parser->blocklevel = 1;
80 parser->max_blocklevel = MW_MAX_RECURSION_DEPTH;
81
82 parser->json_depth = 1;
83 parser->max_json_depth = MW_MAX_RECURSION_DEPTH;
84
85 parser->skip_comments = true;
86
87 if (!pw_create_empty_string(DEFAULT_LINE_CAPACITY, 1, &parser->current_line)) {
88 goto error;
89 }
90 if (!pw_map_va(&parser->custom_parsers,
91 PwString("raw"), PwPtr((void*) parse_raw_value),
92 PwString("literal"), PwPtr((void*) parse_literal_string),
93 PwString("folded"), PwPtr((void*) parse_folded_string),
94 PwString("datetime"), PwPtr((void*) parse_datetime),
95 PwString("timestamp"), PwPtr((void*) parse_timestamp),
96 PwString("json"), PwPtr((void*) _mw_json_parser_func)
97 )) {
98 goto error;
99 }
100 if (!pw_start_read_lines(markup)) {
101 goto error;
102 }
103 return parser;
104
105error:
106 mw_delete_parser(&parser);
107 return nullptr;
108}
109
110void mw_delete_parser(MwParser** parser_ptr)
111{
112 MwParser* parser = *parser_ptr;
113 *parser_ptr = nullptr;
114 pw_destroy(&parser->markup);
115 pw_destroy(&parser->current_line);
116 pw_destroy(&parser->custom_parsers);
117 release((void**) &parser, sizeof(MwParser));
118}
119
120[[nodiscard]] bool mw_set_custom_parser(MwParser* parser, char* convspec, MwBlockParserFunc parser_func)
121{
122 PwValue key = PwStaticString(convspec);
123 PwValue value = PwPtr((void*) parser_func);
124 return pw_map_update(&parser->custom_parsers, &key, &value);
125}
126
127static inline bool have_custom_parser(MwParser* parser, PwValuePtr convspec)
128{
129 return pw_map_has_key(&parser->custom_parsers, convspec);
130}
131
132static inline MwBlockParserFunc get_custom_parser(MwParser* parser, PwValuePtr convspec)
133{
134 PwValue parser_func = PW_NULL;
135 if (!pw_map_get(&parser->custom_parsers, convspec, &parser_func)) {
136 return nullptr;
137 }
138 return (MwBlockParserFunc) (parser_func.func_ptr);
139}
140
141bool _mw_end_of_block()
142{
143 return (current_task->status.type_id == PwTypeId_Status)
144 && (current_task->status.status_code == MW_END_OF_BLOCK);
145}
146
147static inline bool end_of_line(PwValuePtr str, unsigned position)
148/*
149 * Return true if position is beyond end of line.
150 */
151{
152 return !pw_string_index_valid(str, position);
153}
154
155static inline bool isspace_or_eol_at(PwValuePtr str, unsigned position)
156{
157 if (end_of_line(str, position)) {
158 return true;
159 } else {
160 return pw_isspace(pw_char_at(str, position));
161 }
162}
163
164[[nodiscard]] static bool read_line(MwParser* parser)
165/*
166 * Read line into parser->current line and strip trailing spaces.
167 * Return status.
168 */
169{
170 if (!pw_read_line_inplace(&parser->markup, &parser->current_line)) {
171 return false;
172 }
173 // strip trailing spaces
174 if (!pw_string_rtrim(&parser->current_line)) {
175 return false;
176 }
177 // measure indent
178 parser->current_indent = pw_string_skip_spaces(&parser->current_line, 0);
179
180 // set current_line
181 parser->line_number = pw_get_line_number(&parser->markup);
182
183 return true;
184}
185
186static inline bool is_comment_line(MwParser* parser)
187/*
188 * Return true if current line starts with MW_COMMENT char.
189 */
190{
191 return pw_char_at(&parser->current_line, parser->current_indent) == MW_COMMENT;
192}
193
[[nodiscard]] bool _mw_read_block_line(MwParser* parser)
/*
 * Read next line of the current block into parser->current_line.
 *
 * Return true if a line belonging to the current block was read:
 * either an empty line or a line indented at least by parser->block_indent.
 *
 * Return false otherwise. The status is MW_END_OF_BLOCK when the block has ended
 * (unindented non-comment line, or end of markup), PW_ERROR_EOF when the end of markup
 * was already reached and blocklevel is zero, PW_ERROR_UNREAD_FAILED when the
 * unindented line could not be pushed back, or whatever the failed call has set.
 *
 * While parser->skip_comments is set, empty lines and comment lines are skipped;
 * the flag is cleared on the first line that is neither.
 */
{
    TRACEPOINT();

    if (parser->eof) {
        // end of markup was detected by a previous call
        if (parser->blocklevel) {
            // continue returning this for nested blocks
            pw_set_status(PwStatus(MW_END_OF_BLOCK));
        } else {
            pw_set_status(PwStatus(PW_ERROR_EOF));
        }
        return false;
    }
    for (;;) {{
        if (!read_line(parser)) {
            if (pw_is_eof()) {
                // end of markup terminates the block; remember it for subsequent calls
                parser->eof = true;
                pw_destroy(&parser->current_line);
                pw_set_status(PwStatus(MW_END_OF_BLOCK));
            }
            // for errors other than EOF the status set by read_line is left as is
            return false;
        }
        if (parser->skip_comments) {
            // skip empty lines too
            if (pw_strlen(&parser->current_line) == 0) {
                continue;
            }
            if (is_comment_line(parser)) {
                continue;
            }
            // first meaningful line found, stop skipping
            parser->skip_comments = false;
        }
        if (pw_strlen(&parser->current_line) == 0) {
            // return empty line as is
            return true;
        }
        if (parser->current_indent >= parser->block_indent) {
            // indentation is okay, return line
            return true;
        }
        // unindent detected
        if (is_comment_line(parser)) {
            // skip unindented comments
            continue;
        }
        TRACE("unindent");
        // end of block: push the line back so it can be read again
        if (!pw_unread_line(&parser->markup, &parser->current_line)) {
            pw_set_status(PwStatus(PW_ERROR_UNREAD_FAILED));
            return false;
        }
        // the pushed back line is not part of this block, make current line empty
        if (!pw_string_truncate(&parser->current_line, 0)) {
            return false;
        }
        pw_set_status(PwStatus(MW_END_OF_BLOCK));
        return false;
    }}
}
252
253[[nodiscard]] bool _mw_read_block(MwParser* parser, PwValuePtr result)
254{
255 TRACEPOINT();
256
257 if (!pw_create(PwTypeId_BasicArray, result)) {
258 return false;
259 }
260 for (;;) {{
261 // append line
262 PwValue line = PW_NULL;
263 if (!pw_substr(&parser->current_line, parser->block_indent, UINT_MAX, &line)) {
264 return false;
265 }
266 if (!pw_array_append(result, &line)){
267 return false;
268 }
269 // read next line
270 if (!_mw_read_block_line(parser)) {
271 if (_mw_end_of_block()) {
272 return true;
273 }
274 return false;
275 }
276 }}
277}
278
279[[nodiscard]] static bool parse_nested_block(MwParser* parser, unsigned block_pos,
280 MwBlockParserFunc parser_func, PwValuePtr result)
281/*
282 * Set block indent to `block_pos` and call parser_func.
283 */
284{
285 if (parser->blocklevel >= parser->max_blocklevel) {
286 pw_set_status(mw_parser_error(parser, parser->current_indent, "Too many nested blocks"));
287 return false;
288 }
289
290 // start nested block
291 parser->blocklevel++;
292 unsigned saved_block_indent = parser->block_indent;
293 parser->block_indent = block_pos;
294
295 TRACE_ENTER();
296
297 // call parser function
298 bool ret = parser_func(parser, result);
299
300 // end nested block
301 parser->block_indent = saved_block_indent;
302 parser->blocklevel--;
303
304 TRACE_EXIT();
305 return ret;
306}
307
308[[nodiscard]] static bool parse_nested_block_from_next_line(MwParser* parser,
309 MwBlockParserFunc parser_func, PwValuePtr result)
310/*
311 * Read next line, set block indent to current indent plus one, and call parser_func.
312 */
313{
314 TRACEPOINT();
315 TRACE("new block_pos %u", parser->block_indent + 1);
316
317 // temporarily increment block indent by one and read next line
318 parser->block_indent++;
319 parser->skip_comments = true;
320 bool ret = _mw_read_block_line(parser);
321 parser->block_indent--;
322
323 if (!ret) {
324 if (_mw_end_of_block()) {
325 pw_set_status(mw_parser_error(parser, parser->current_indent, "Empty block"));
326 }
327 return false;
328 }
329
330 // call parse_nested_block
331 return parse_nested_block(parser, parser->block_indent + 1, parser_func, result);
332}
333
334unsigned _mw_get_start_position(MwParser* parser)
335{
336 if (parser->block_indent < parser->current_indent) {
337 return parser->current_indent;
338 } else {
339 return pw_string_skip_spaces(&parser->current_line, parser->block_indent);
340 }
341}
342
343bool _mw_comment_or_end_of_line(MwParser* parser, unsigned position)
344{
345 position = pw_string_skip_spaces(&parser->current_line, position);
346 return (end_of_line(&parser->current_line, position)
347 || pw_char_at(&parser->current_line, position) == MW_COMMENT);
348}
349
350[[nodiscard]] static bool parse_convspec(MwParser* parser, unsigned opening_colon_pos,
351 unsigned* end_pos, PwValuePtr result)
352/*
353 * Extract conversion specifier starting from `opening_colon_pos` in the `current_line`.
354 *
355 * On success return string and write `end_pos`.
356 *
357 * If conversion specified is not detected, return PwNull()
358 */
359{
360 // make result Null
361 pw_destroy(result);
362
363 PwValuePtr current_line = &parser->current_line;
364
365 unsigned start_pos = opening_colon_pos + 1;
366 unsigned closing_colon_pos;
367 if (!pw_strchr(current_line, ':', start_pos, &closing_colon_pos)) {
368 return true;
369 }
370 if (closing_colon_pos == start_pos) {
371 // empty conversion specifier
372 return true;
373 }
374 if (!isspace_or_eol_at(current_line, closing_colon_pos + 1)) {
375 // not a conversion specifier
376 return true;
377 }
378 PwValue convspec = PW_NULL;
379 if (!pw_substr(current_line, start_pos, closing_colon_pos, &convspec)) {
380 return false;
381 }
382 if (!pw_string_trim(&convspec)) {
383 return false;
384 }
385 if (!have_custom_parser(parser, &convspec)) {
386 // such a conversion specifier is not defined
387 return true;
388 }
389 *end_pos = closing_colon_pos + 1;
390 pw_move(&convspec, result);
391 return true;
392}
393
394[[nodiscard]] static bool parse_raw_value(MwParser* parser, PwValuePtr result)
395{
396 TRACEPOINT();
397
398 PwValue lines = PW_NULL;
399 if (!_mw_read_block(parser, &lines)) {
400 return false;
401 }
402 if (pw_array_length(&lines) > 1) {
403 // append one empty line for ending line break
404 PwValue empty_line = PW_STRING("");
405 if (!pw_array_append(&lines, &empty_line)) {
406 return false;
407 }
408 }
409 // return concatenated lines
410 return pw_array_join(&lines, '\n', result);
411}
412
413[[nodiscard]] static bool parse_literal_string(MwParser* parser, PwValuePtr result)
414/*
415 * Parse current block as a literal string.
416 */
417{
418 TRACEPOINT();
419
420 PwValue lines = PW_NULL;
421 if (!_mw_read_block(parser, &lines)) {
422 return false;
423 }
424
425 // normalize list of lines
426
427 if (!pw_array_dedent(&lines)) {
428 return false;
429 }
430
431 // drop empty trailing lines
432 unsigned len = pw_array_length(&lines);
433 while (len--) {{
434 PwValue line = PW_NULL;
435 if (!pw_array_item(&lines, len, &line)) {
436 return false;
437 }
438 if (pw_strlen(&line) != 0) {
439 break;
440 }
441 if (!pw_array_del(&lines, len, len + 1)) {
442 return false;
443 }
444 }}
445
446 // append one empty line for ending line break
447 if (pw_array_length(&lines) > 1) {
448 PwValue empty_line = PW_STRING("");
449 if (!pw_array_append(&lines, &empty_line)) {
450 return false;
451 }
452 }
453
454 // return concatenated lines
455 return pw_array_join(&lines, '\n', result);
456}
457
458[[nodiscard]] bool _mw_unescape_line(MwParser* parser, PwValuePtr line, unsigned line_number,
459 char32_t quote, unsigned start_pos, unsigned end_pos, PwValuePtr result)
460{
461 if (!pw_create_empty_string(end_pos - start_pos, // unescaped string can be shorter
462 line->char_size, result)) {
463 return false;
464 }
465 unsigned pos = start_pos;
466 while (pos < end_pos) {
467 char32_t chr = pw_char_at(line, pos);
468 if (chr == quote) {
469 // closing quotation mark detected
470 break;
471 }
472 if (chr != '\\') {
473 if (!pw_string_append(result, chr)) {
474 return false;
475 }
476 } else {
477 // start of escape sequence
478 pos++;
479 if (pos >= end_pos) {
480 if (!pw_string_append(result, chr)) { // leave backslash in the result
481 return false;
482 }
483 }
484 int hexlen;
485 chr = pw_char_at(line, pos);
486 switch (chr) {
487
488 // Simple escape sequences
489 case '\'': // \' single quote byte 0x27
490 case '"': // \" double quote byte 0x22
491 case '?': // \? question mark byte 0x3f
492 case '\\': // \\ backslash byte 0x5c
493 if (!pw_string_append(result, chr)) { return false; };
494 break;
495 case 'a': if (!pw_string_append(result, 0x07)) { return false; } break; // audible bell
496 case 'b': if (!pw_string_append(result, 0x08)) { return false; } break; // backspace
497 case 'f': if (!pw_string_append(result, 0x0c)) { return false; } break; // form feed
498 case 'n': if (!pw_string_append(result, 0x0a)) { return false; } break; // line feed
499 case 'r': if (!pw_string_append(result, 0x0d)) { return false; } break; // carriage return
500 case 't': if (!pw_string_append(result, 0x09)) { return false; } break; // horizontal tab
501 case 'v': if (!pw_string_append(result, 0x0b)) { return false; } break; // vertical tab
502
503 // Numeric escape sequences
504 case 'o': {
505 // \on{1:3} code unit n... (1-3 octal digits)
506 char32_t v = 0;
507 for (int i = 0; i < 3; i++) {
508 pos++;
509 if (pos >= end_pos) {
510 if (i == 0) {
511 pw_set_status(mw_parser_error2(parser, line_number, pos, "Incomplete octal value"));
512 return false;
513 }
514 break;
515 }
516 char32_t c = pw_char_at(line, pos);
517 if ('0' <= c && c <= '7') {
518 v <<= 3;
519 v += c - '0';
520 } else {
521 pw_set_status(mw_parser_error2(parser, line_number, pos, "Bad octal value"));
522 return false;
523 }
524 }
525 if (!pw_string_append(result, v)) { return false; }
526 break;
527 }
528 case 'x':
529 // \xn{2} code unit n... (exactly 2 hexadecimal digits are required)
530 hexlen = 2;
531 goto parse_hex_value;
532
533 // Unicode escape sequences
534 case 'u':
535 // \un{4} code point U+n... (exactly 4 hexadecimal digits are required)
536 hexlen = 4;
537 goto parse_hex_value;
538 case 'U':
539 // \Un{8} code point U+n... (exactly 8 hexadecimal digits are required)
540 hexlen = 8;
541
542 parse_hex_value: {
543 char32_t v = 0;
544 for (int i = 0; i < hexlen; i++) {
545 pos++;
546 if (pos >= end_pos) {
547 pw_set_status(mw_parser_error2(parser, line_number, pos, "Incomplete hexadecimal value"));
548 return false;
549 }
550 char32_t c = pw_char_at(line, pos);
551 if ('0' <= c && c <= '9') {
552 v <<= 4;
553 v += c - '0';
554 } else if ('a' <= c && c <= 'f') {
555 v <<= 4;
556 v += c - 'a' + 10;
557 } else if ('A' <= c && c <= 'F') {
558 v <<= 4;
559 v += c - 'A' + 10;
560 } else {
561 pw_set_status(mw_parser_error2(parser, line_number, pos, "Bad hexadecimal value"));
562 return false;
563 }
564 }
565 if (!pw_string_append(result, v)) { return false; }
566 break;
567 }
568 default:
569 // not a valid escape sequence
570 if (!pw_string_append(result, '\\')) { return false; }
571 if (!pw_string_append(result, chr)) { return false; }
572 break;
573 }
574 }
575 pos++;
576 }
577 return true;
578}
579
[[nodiscard]] static bool fold_lines(MwParser* parser, PwValuePtr lines,
                                     char32_t quote, PwValuePtr line_numbers, PwValuePtr result)
/*
 * Fold list of lines and return concatenated string.
 *
 * Lines are dedented first. Leading and trailing lines that are empty or contain
 * only spaces are dropped. The remaining lines are joined with single spaces, except:
 *   - an empty line is replaced with a line break;
 *   - no space is inserted after such a line break;
 *   - no space is inserted before a line that already starts with a space.
 *
 * If `quote` is nonzero, unescape lines. In this case `line_numbers` must contain
 * the line number for each item of `lines` at the same index; they are passed
 * to _mw_unescape_line. If `quote` is zero, `line_numbers` is not used.
 *
 * Return true on success with the string written to `result` (an empty string if nothing
 * remains after dropping lines). Return false if any called function failed.
 */
{
    if (!pw_array_dedent(lines)) {
        return false;
    }
    unsigned len = pw_array_length(lines);

    // skip leading empty lines
    unsigned start_i = 0;
    for (; start_i < len; start_i++) {{
        PwValue line = PW_NULL;
        if (!pw_array_item(lines, start_i, &line)) {
            return false;
        }
        if (pw_strlen(&line) != 0 && !pw_string_isspace(&line)) {
            break;
        }
    }}
    if (start_i == len) {
        // return empty string
        pw_destroy(result);
        *result = PwString("");
        return true;
    }

    // skip trailing empty lines
    unsigned end_i = len;
    for (; end_i; end_i--) {{
        PwValue line = PW_NULL;
        if (!pw_array_item(lines, end_i - 1, &line)) {
            return false;
        }
        if (pw_strlen(&line) != 0 && !pw_string_isspace(&line)) {
            break;
        }
    }}
    if (end_i == 0) {
        // return empty string
        // (the line at start_i is neither empty nor blank, so the loop above stops
        // at it at the latest and this branch is a safeguard only)
        pw_destroy(result);
        *result = PwString("");
        return true;
    }

    // calculate length of result
    unsigned result_len = end_i - start_i - 1;  // reserve space for separators
    uint8_t char_size = 1;
    for (unsigned i = start_i; i < end_i; i++) {{
        PwValue line = PW_NULL;
        if (!pw_array_item(lines, i, &line)) {
            return false;
        }
        result_len += pw_strlen(&line);
        // the result must use the widest character size among the lines
        uint8_t cs = line.char_size;
        if (cs > char_size) {
            char_size = cs;
        }
    }}

    // allocate result
    if (!pw_create_empty_string(result_len, char_size, result)) {
        return false;
    }

    // concatenate lines
    bool prev_LF = false;  // true if the previous line was empty and a line break was appended for it
    for (unsigned i = start_i; i < end_i; i++) {{
        PwValue line = PW_NULL;
        if (!pw_array_item(lines, i, &line)) {
            return false;
        }
        // separators go before every line except the first one
        if (i > start_i) {
            if (pw_strlen(&line) == 0) {
                // treat empty lines as LF
                if (!pw_string_append(result, '\n')) {
                    return false;
                }
                prev_LF = true;
            } else {
                if (prev_LF) {
                    // do not append separator if previous line was empty
                    prev_LF = false;
                } else {
                    if (pw_isspace(pw_char_at(&line, 0))) {
                        // do not append separator if the line already starts with space
                    } else {
                        if (!pw_string_append(result, ' ')) {
                            return false;
                        }
                    }
                }
            }
        }
        if (quote) {
            // unescape the whole line; line number is needed for error reporting only
            PwValue line_number = PW_NULL;
            if (!pw_array_item(line_numbers, i, &line_number)) {
                return false;
            }
            PwValue unescaped = PW_NULL;
            if (!_mw_unescape_line(parser, &line, line_number.unsigned_value, quote, 0, pw_strlen(&line), &unescaped)) {
                return false;
            }
            if (!pw_string_append(result, &unescaped)) {
                return false;
            }
        } else {
            if (!pw_string_append(result, &line)) {
                return false;
            }
        }
    }}
    return true;
}
698
699[[nodiscard]] static bool parse_folded_string(MwParser* parser, PwValuePtr result)
700{
701 TRACEPOINT();
702
703 PwValue lines = PW_NULL;
704 if (!_mw_read_block(parser, &lines)) {
705 return false;
706 }
707 return fold_lines(parser, &lines, 0, nullptr, result);
708}
709
710bool _mw_find_closing_quote(PwValuePtr line, char32_t quote, unsigned start_pos, unsigned* end_pos)
711{
712 for (;;) {
713 if (!pw_strchr(line, quote, start_pos, end_pos)) {
714 return false;
715 }
716 // check if the quotation mark is not escaped
717 if (*end_pos && pw_char_at(line, *end_pos - 1) == '\\') {
718 // continue searching
719 start_pos = *end_pos + 1;
720 } else {
721 return true;
722 }
723 }
724}
725
726[[nodiscard]] static bool parse_quoted_string(MwParser* parser, unsigned opening_quote_pos,
727 unsigned* end_pos, PwValuePtr result)
728/*
729 * Parse quoted string starting from `opening_quote_pos` in the current line.
730 *
731 * Write next position after the closing quotation mark to `end_pos`.
732 */
733{
734 TRACEPOINT();
735
736 // Get opening quote. The closing quote should be the same.
737 char32_t quote = pw_char_at(&parser->current_line, opening_quote_pos);
738
739 // process first line
740 unsigned closing_quote_pos;
741 if (_mw_find_closing_quote(&parser->current_line, quote, opening_quote_pos + 1, &closing_quote_pos)) {
742 // single-line string
743 *end_pos = closing_quote_pos + 1;
744 return _mw_unescape_line(parser, &parser->current_line, parser->line_number,
745 quote, opening_quote_pos + 1, closing_quote_pos, result);
746 }
747
748 unsigned block_indent = opening_quote_pos + 1;
749
750 // make parser read nested block
751 unsigned saved_block_indent = parser->block_indent;
752 parser->block_indent = block_indent;
753 parser->blocklevel++;
754
755 // read block
756 PwValue lines = PW_NULL;
757 if (!pw_create(PwTypeId_BasicArray, &lines)) {
758 return false;
759 }
760 PwValue line_numbers = PW_NULL;
761 if (!pw_create(PwTypeId_BasicArray, &line_numbers)) {
762 return false;
763 }
764 bool closing_quote_detected = false;
765 for (;;) {{
766 // append line number
767 PwValue n = PwUnsigned(parser->line_number);
768 if (!pw_array_append(&line_numbers, &n)) {
769 return false;
770 }
771 // append line
772 if (_mw_find_closing_quote(&parser->current_line, quote, block_indent, end_pos)) {
773 // final line
774 PwValue final_line = PW_NULL;
775 if (!pw_substr(&parser->current_line, block_indent, *end_pos, &final_line)) {
776 return false;
777 }
778 // strip trailing spaces
779 if (!pw_string_rtrim(&final_line)) {
780 return false;
781 }
782 if (!pw_array_append(&lines, &final_line)) {
783 return false;
784 }
785 (*end_pos)++;
786 closing_quote_detected = true;
787 break;
788 } else {
789 // intermediate line
790 PwValue line = PW_NULL;
791 if (!pw_substr(&parser->current_line, block_indent, UINT_MAX, &line)) {
792 return false;
793 }
794 if (!pw_array_append(&lines, &line)) {
795 return false;
796 }
797 }
798 // read next line
799 if (!_mw_read_block_line(parser)) {
800 if (_mw_end_of_block()) {
801 break;
802 }
803 return false;
804 }
805 }}
806
807 // finished reading nested block
808 parser->block_indent = saved_block_indent;
809 parser->blocklevel--;
810
811 if (!closing_quote_detected) {
812
813 static char unterminated[] = "String has no closing quote";
814
815 // the above loop terminated abnormally, need to read next line
816 if (!_mw_read_block_line(parser)) {
817 if (_mw_end_of_block()) {
818 pw_set_status(mw_parser_error(parser, parser->current_indent, unterminated));
819 }
820 return false;
821 }
822 // check if the line starts with a quote with the same indent as the opening quote
823 if (parser->current_indent == opening_quote_pos
824 && pw_char_at(&parser->current_line, parser->current_indent) == quote) {
825
826 *end_pos = opening_quote_pos + 1;
827 } else {
828 pw_set_status(mw_parser_error(parser, parser->current_indent, unterminated));
829 return false;
830 }
831 }
832
833 // fold and unescape
834 return fold_lines(parser, &lines, quote, &line_numbers, result);
835}
836
837[[nodiscard]] static bool parse_datetime(MwParser* parser, PwValuePtr result)
838/*
839 * Parse value date/time starting from block indent in the current line.
840 * Return PwDateTime on success, PwStatus on error.
841 */
842{
843 static char bad_datetime[] = "Bad date/time";
844 static char32_t allowed_terminators[] = { MW_COMMENT, 0 };
845
846 unsigned start_pos = _mw_get_start_position(parser);
847 unsigned end_pos;
848 if (!_pw_parse_datetime(&parser->current_line, start_pos, &end_pos, allowed_terminators, result)) {
849 if (current_task->status.status_code == PW_ERROR_BAD_DATETIME) {
850 pw_set_status(mw_parser_error(parser, start_pos, bad_datetime));
851 }
852 return false;
853 }
854 if (!_mw_comment_or_end_of_line(parser, end_pos)) {
855 pw_set_status(mw_parser_error(parser, start_pos, bad_datetime));
856 return false;
857 }
858 return true;
859}
860
861[[nodiscard]] static bool parse_timestamp(MwParser* parser, PwValuePtr result)
862/*
863 * Parse value as timestamp starting from block indent in the current line.
864 * Return PwTimestamp on success, PwStatus on error.
865 */
866{
867 static char bad_timestamp[] = "Bad timestamp";
868 static char32_t allowed_terminators[] = { MW_COMMENT, 0 };
869
870 unsigned start_pos = _mw_get_start_position(parser);
871 unsigned end_pos;
872 if (!_pw_parse_timestamp(&parser->current_line, start_pos, &end_pos, allowed_terminators, result)) {
873 if (current_task->status.status_code == PW_ERROR_BAD_TIMESTAMP) {
874 pw_set_status(mw_parser_error(parser, start_pos, bad_timestamp));
875 } else if (current_task->status.status_code == PW_ERROR_NUMERIC_OVERFLOW) {
876 pw_set_status(mw_parser_error(parser, start_pos, "Numeric overflow"));
877 }
878 return false;
879 }
880 if (!_mw_comment_or_end_of_line(parser, end_pos)) {
881 pw_set_status(mw_parser_error(parser, end_pos, bad_timestamp));
882 return false;
883 }
884 return true;
885}
886
887[[nodiscard]] bool _mw_parse_number(MwParser* parser, unsigned start_pos, int sign,
888 unsigned* end_pos, char32_t* allowed_terminators, PwValuePtr result)
889{
890 TRACEPOINT();
891 TRACE("start_pos %u", start_pos);
892
893 if (!_pw_parse_number(&parser->current_line, start_pos, sign, end_pos, allowed_terminators, result)) {
894 if (current_task->status.status_code == PW_ERROR_BAD_NUMBER) {
895 pw_set_status(mw_parser_error(parser, start_pos, "Bad number"));
896 } else if (current_task->status.status_code == PW_ERROR_NUMERIC_OVERFLOW) {
897 pw_set_status(mw_parser_error(parser, start_pos, "Numeric overflow"));
898 }
899 return false;
900 }
901 return true;
902}
903
904[[nodiscard]] static bool parse_list(MwParser* parser, PwValuePtr result)
905/*
906 * Parse list.
907 *
908 * Return list value on success.
909 * Return nullptr on error.
910 */
911{
912 TRACE_ENTER();
913
914 if (!pw_create(PwTypeId_BasicArray, result)) {
915 return false;
916 }
917
918 /*
919 * All list items must have the same indent.
920 * Save indent of the first item (current one) and check it for subsequent items.
921 */
922 unsigned item_indent = _mw_get_start_position(parser);
923
924 for (;;) {
925 {
926 // check if hyphen is followed by space or end of line
927 unsigned next_pos = item_indent + 1;
928 if (!isspace_or_eol_at(&parser->current_line, next_pos)) {
929 pw_set_status(mw_parser_error(parser, item_indent, "Bad list item"));
930 return false;
931 }
932
933 // parse item as a nested block
934
935 PwValue item = PW_NULL;
936 if (_mw_comment_or_end_of_line(parser, next_pos)) {
937 if (!parse_nested_block_from_next_line(parser, value_parser_func, &item)) {
938 return false;
939 }
940 } else {
941 // nested block starts on the same line, increment block position
942 next_pos++;
943 if (!parse_nested_block(parser, next_pos, value_parser_func, &item)) {
944 return false;
945 }
946 }
947 if (!pw_array_append(result, &item)) {
948 return false;
949 }
950 if (!_mw_read_block_line(parser)) {
951 if (_mw_end_of_block()) {
952 break;
953 }
954 return false;
955 }
956 if (parser->current_indent != item_indent) {
957 pw_set_status(mw_parser_error(parser, parser->current_indent, "Bad indentation of list item"));
958 return false;
959 }
960 }
961 }
962 TRACE_EXIT();
963 return true;
964}
965
[[nodiscard]] static bool parse_map(MwParser* parser, PwValuePtr first_key,
                                    PwValuePtr convspec_arg, unsigned value_pos, PwValuePtr result)
/*
 * Parse map.
 *
 * Key is already parsed, continue parsing from `value_pos` in the `current_line`.
 *
 * `first_key` is the already parsed key and `convspec_arg` is the conversion specifier
 * that followed it; if it is a string, the value is parsed with the custom parser
 * registered for it, otherwise with value_parser_func.
 *
 * Each value is parsed as a nested block: from the next line if nothing but a comment
 * follows the key-value separator, otherwise from the same line.
 * Subsequent keys are parsed with parse_value and must have the same indent as the first one.
 *
 * Return map value on success.
 * Return false on failure with the status set by the failed operation,
 * or a parser error for bad indentation of a key.
 */
{
    TRACE_ENTER();

    if (!pw_create(PwTypeId_BasicMap, result)) {
        TRACE_EXIT();
        return false;
    }

    // work on own copies because key and convspec are replaced on each iteration
    PwValue key = pw_clone(first_key);
    PwValue convspec = pw_clone(convspec_arg);

    /*
     * All keys in the map must have the same indent.
     * Save indent of the first key (current one) and check it for subsequent keys.
     */
    unsigned key_indent = _mw_get_start_position(parser);

    for (;;) {
        TRACE("parse value (line %u) from position %u", parser->line_number, value_pos);
        {
            // parse value as a nested block

            // conversion specifier, if any, selects the parser for the value
            MwBlockParserFunc parser_func = value_parser_func;
            if (pw_is_string(&convspec)) {
                parser_func = get_custom_parser(parser, &convspec);
            }
            PwValue value = PW_NULL;
            if (_mw_comment_or_end_of_line(parser, value_pos)) {
                // nothing to parse after the separator, value starts from the next line
                if (!parse_nested_block_from_next_line(parser, parser_func, &value)) {
                    TRACE_EXIT();
                    return false;
                }
            } else {
                if (!parse_nested_block(parser, value_pos, parser_func, &value)) {
                    TRACE_EXIT();
                    return false;
                }
            }
            if (!pw_map_update(result, &key, &value)) {
                TRACE_EXIT();
                return false;
            }
        }
        TRACE("parse next key");
        {
            // reset key and convspec before parsing the next ones
            pw_destroy(&key);
            pw_destroy(&convspec);

            if (!_mw_read_block_line(parser)) {
                if (_mw_end_of_block()) {
                    TRACE("end of map");
                    break;
                }
                TRACE_EXIT();
                return false;
            }
            if (parser->current_indent != key_indent) {
                pw_set_status(mw_parser_error(parser, parser->current_indent, "Bad indentation of map key"));
                TRACE_EXIT();
                return false;
            }
            // a map key is expected: parse_value writes position of the value to value_pos
            if (!parse_value(parser, &value_pos, &convspec, &key)) {
                TRACE_EXIT();
                return false;
            }
        }
    }
    TRACE_EXIT();
    return true;
}
1045
1046[[nodiscard]] static bool is_kv_separator(MwParser* parser, unsigned colon_pos,
1047 PwValuePtr convspec_out, unsigned *value_pos, bool* result)
1048/*
1049 * Set result true if colon_pos is followed by end of line, space, or conversion specifier.
1050 * Write conversion specifier to `convspec_out` if value is followed by conversion specifier.
1051 * Write position of value to value_pos.
1052 */
1053{
1054 PwValuePtr current_line = &parser->current_line;
1055
1056 unsigned next_pos = colon_pos + 1;
1057
1058 if (end_of_line(current_line, next_pos)) {
1059 *value_pos = next_pos;
1060 *result = true;
1061 return true;
1062 }
1063 char32_t chr = pw_char_at(current_line, next_pos);
1064 if (isspace(chr)) {
1065 *value_pos = next_pos + 1; // value should be separated from key by at least one space
1066 next_pos = pw_string_skip_spaces(current_line, next_pos);
1067 // cannot be end of line here because current line is R-trimmed and EOL is already checked
1068 chr = pw_char_at(current_line, next_pos);
1069 if (chr != ':') {
1070 // separator without conversion specifier
1071 *result = true;
1072 return true;
1073 }
1074 } else if (chr != ':') {
1075 // key not followed immediately by conversion specifier -> not a separator
1076 *result = false;
1077 return true;
1078 }
1079
1080 // try parsing conversion specifier
1081 // value_pos will be updated only if conversion specifier is valid
1082 PwValue convspec = PW_NULL;
1083 if (!parse_convspec(parser, next_pos, value_pos, &convspec)) {
1084 return false;
1085 }
1086 if (pw_is_string(&convspec)) {
1087 if (convspec_out) {
1088 pw_move(&convspec, convspec_out);
1089 }
1090 *result = true;
1091 return true;
1092 }
1093
1094 // bad conversion specifier -> not a separator
1095 *result = false;
1096 return true;
1097}
1098
[[nodiscard]] static bool check_value_end(MwParser* parser, PwValuePtr value, unsigned end_pos,
                                          unsigned* nested_value_pos, PwValuePtr convspec_out, PwValuePtr result)
/*
 * Helper function for parse_value.
 *
 * `value` is the value already parsed from the current line and `end_pos`
 * is the position where it ends.
 *
 * Check if value ends with key-value separator and parse map.
 * If not, check if end_pos points to end of line or comment.
 *
 * If `nested_value_pos` is provided, the value is _expected_ to be a map key
 * and _must_ end with key-value separator.
 *
 * On success return parsed value.
 * If `nested_value_pos' is not null, write position of the next char after colon to it
 * and write conversion specifier to `convspec_out` if value is followed by conversion specifier.
 *
 * Read next line if nothing to parse on the current_line.
 *
 * Return cloned value.
 *
 * Return false on failure; parser errors set here are "Map key expected"
 * and "Bad character encountered".
 */
{
    end_pos = pw_string_skip_spaces(&parser->current_line, end_pos);
    if (end_of_line(&parser->current_line, end_pos)) {
        if (nested_value_pos) {
            // the caller expects a map key but there is no separator after the value
            pw_set_status(mw_parser_error(parser, end_pos, "Map key expected"));
            return false;
        }
        // read next line
        if (!_mw_read_block_line(parser)) {
            // end of block is not an error here, the value is complete
            if (!_mw_end_of_block()) {
                return false;
            }
        }
        pw_clone2(value, result);
        return true;
    }

    char32_t chr = pw_char_at(&parser->current_line, end_pos);
    if (chr == ':') {
        // check key-value separator
        PwValue convspec = PW_NULL;
        unsigned value_pos;
        bool kvs;
        if (!is_kv_separator(parser, end_pos, &convspec, &value_pos, &kvs)) {
            return false;
        }
        if (kvs) {
            // found key-value separator
            if (nested_value_pos) {
                // it was anticipated, just return the value
                *nested_value_pos = value_pos;
                pw_move(&convspec, convspec_out);
                pw_clone2(value, result);
                return true;
            }
            // parse map
            // the value turned out to be the first key of a map
            PwValue first_key = pw_clone(value);
            return parse_map(parser, &first_key, &convspec, value_pos, result);
        }
        // colon that is not a separator; error points to the character after it
        pw_set_status(mw_parser_error(parser, end_pos + 1, "Bad character encountered"));
        return false;
    }

    // anything other than a comment is not allowed after the value
    if (chr != MW_COMMENT) {
        pw_set_status(mw_parser_error(parser, end_pos, "Bad character encountered"));
        return false;
    }

    // read next line
    if (!_mw_read_block_line(parser)) {
        // end of block is not an error here, the value is complete
        if (!_mw_end_of_block()) {
            return false;
        }
    }
    pw_clone2(value, result);
    return true;
}
1175
1176[[nodiscard]] static bool parse_value(MwParser* parser, unsigned* nested_value_pos,
1177 PwValuePtr convspec_out, PwValuePtr result)
1178/*
1179 * Parse value starting from `current_line[block_indent]` .
1180 *
1181 * If `nested_value_pos` is provided, the value is _expected_ to be a map key
1182 * and _must_ end with colon or include a colon if it's a literal strings.
1183 *
1184 * On success return parsed value.
1185 * If `nested_value_pos' is provided, write position of the next char after colon to it
1186 * and write conversion specifier to `convspec_out` if it's followed by conversion specifier.
1187 *
1188 * On error return status and set `parser->result["error"]`.
1189 */
1190{
1191 TRACEPOINT();
1192
1193 unsigned start_pos = _mw_get_start_position(parser);
1194
1195 // Analyze first character.
1196 char32_t chr = pw_char_at(&parser->current_line, start_pos);
1197
1198 // first, check if value starts with colon that may denote conversion specifier
1199
1200 if (chr == ':') {
1201 // this might be conversion specifier
1202 if (nested_value_pos) {
1203 // we expect map key, and map keys cannot start with colon
1204 // because they would look same as conversion specifier
1205 pw_set_status(mw_parser_error(parser, start_pos, "Map key expected and it cannot start with colon"));
1206 return false;
1207 }
1208 unsigned value_pos;
1209 PwValue convspec =PW_NULL;
1210 if (!parse_convspec(parser, start_pos, &value_pos, &convspec)) {;
1211 return false;
1212 }
1213 if (!pw_is_string(&convspec)) {
1214 // not a conversion specifier
1215 return parse_literal_string(parser, result);
1216 }
1217 // we have conversion specifier
1218 if (end_of_line(&parser->current_line, value_pos)) {
1219
1220 // conversion specifier is followed by LF
1221 // continue parsing CURRENT block from next line
1222 if (!_mw_read_block_line(parser)) {
1223 if (_mw_end_of_block()) {
1224 pw_set_status(mw_parser_error(parser, parser->current_indent, "Empty block"));
1225 }
1226 return false;
1227 }
1228 // call parser function
1229 MwBlockParserFunc parser_func = get_custom_parser(parser, &convspec);
1230 return parser_func(parser, result);
1231
1232 } else {
1233 // value is on the same line, parse it as nested block
1234 return parse_nested_block(
1235 parser, value_pos, get_custom_parser(parser, &convspec), result
1236 );
1237 }
1238 }
1239
1240 // other values can be map keys
1241
1242 // check for dash
1243
1244 if (chr == '-') {
1245 unsigned next_pos = start_pos + 1;
1246 char32_t next_chr = pw_char_at(&parser->current_line, next_pos);
1247
1248 // if followed by digit, it's a number
1249 if ('0' <= next_chr && next_chr <= '9') {
1250 unsigned end_pos;
1251 PwValue number = PW_NULL;
1252 if (!_mw_parse_number(parser, next_pos, -1, &end_pos, number_terminators, &number)) {
1253 return false;
1254 }
1255 return check_value_end(parser, &number, end_pos, nested_value_pos, convspec_out, result);
1256 }
1257 // if followed by space or end of line, that's a list item
1258 if (isspace_or_eol_at(&parser->current_line, next_pos)) {
1259 if (nested_value_pos) {
1260 pw_set_status(mw_parser_error(parser, start_pos, "Map key expected and it cannot be a list"));
1261 return false;
1262 }
1263 // yes, it's a list item
1264 return parse_list(parser, result);
1265 }
1266 // otherwise, it's a literal string or map
1267 goto parse_literal_string_or_map;
1268 }
1269
1270 // check for quoted string
1271
1272 if (chr == '"' || chr == '\'') {
1273 // quoted string
1274 unsigned start_line = parser->line_number;
1275 unsigned end_pos;
1276 PwValue str = PW_NULL;
1277 if (!parse_quoted_string(parser, start_pos, &end_pos, &str)) {
1278 return false;
1279 }
1280 unsigned end_line = parser->line_number;
1281 if (end_line == start_line) {
1282 // single-line string can be a map key
1283 return check_value_end(parser, &str, end_pos, nested_value_pos, convspec_out, result);
1284 } else if (_mw_comment_or_end_of_line(parser, end_pos)) {
1285 // multi-line string cannot be a key
1286 pw_move(&str, result);
1287 return true;
1288 } else {
1289 pw_set_status(mw_parser_error(parser, end_pos, "Bad character after quoted string"));
1290 return false;
1291 }
1292 }
1293
1294 // check for reserved keywords
1295
1296 TRACE("trying reserved keywords");
1297 if (pw_substring_eq(&parser->current_line, start_pos, start_pos + 4, "null")) {
1298 PwValue null_value = PW_NULL;
1299 return check_value_end(parser, &null_value, start_pos + 4, nested_value_pos, convspec_out, result);
1300 }
1301 if (pw_substring_eq(&parser->current_line, start_pos, start_pos + 4, "true")) {
1302 PwValue true_value = PW_BOOL(true);
1303 return check_value_end(parser, &true_value, start_pos + 4, nested_value_pos, convspec_out, result);
1304 }
1305 if (pw_substring_eq(&parser->current_line, start_pos, start_pos + 5, "false")) {
1306 PwValue false_value = PW_BOOL(false);
1307 return check_value_end(parser, &false_value, start_pos + 5, nested_value_pos, convspec_out, result);
1308 }
1309
1310 // try parsing number
1311
1312 TRACE("not a keyword, trying number");
1313 if (chr == '+') {
1314 char32_t next_chr = pw_char_at(&parser->current_line, start_pos + 1);
1315 if ('0' <= next_chr && next_chr <= '9') {
1316 start_pos++;
1317 chr = next_chr;
1318 }
1319 }
1320 if ('0' <= chr && chr <= '9') {
1321 unsigned end_pos;
1322 PwValue number = PW_NULL;
1323 if (!_mw_parse_number(parser, start_pos, 1, &end_pos, number_terminators, &number)) {
1324 return false;
1325 }
1326 return check_value_end(parser, &number, end_pos, nested_value_pos, convspec_out, result);
1327 }
1328 TRACE("not a number, pasring literal string or map");
1329
1330parse_literal_string_or_map:
1331
1332 // look for key-value separator
1333 for (unsigned pos = start_pos;;) {
1334 unsigned colon_pos;
1335 if (!pw_strchr(&parser->current_line, ':', pos, &colon_pos)) {
1336 break;
1337 }
1338 PwValue convspec = PW_NULL;
1339 unsigned value_pos;
1340 bool kvs;
1341 if (!is_kv_separator(parser, colon_pos, &convspec, &value_pos, &kvs)) {
1342 return false;
1343 }
1344 if (kvs) {
1345 // found key-value separator, get key
1346 PwValue key = PW_NULL;
1347 if (!pw_substr(&parser->current_line, start_pos, colon_pos, &key)) {
1348 return false;
1349 }
1350 // strip trailing spaces
1351 if (!pw_string_rtrim(&key)) {
1352 return false;
1353 }
1354 if (nested_value_pos) {
1355 // key was anticipated, simply return it
1356 *nested_value_pos = value_pos;
1357 pw_move(&convspec, convspec_out);
1358 pw_move(&key, result);
1359 return true;
1360 }
1361 // parse map
1362 return parse_map(parser, &key, &convspec, value_pos, result);
1363 }
1364 pos = colon_pos + 1;
1365 }
1366
1367 // separator not found
1368
1369 if (nested_value_pos) {
1370 // expecting key, but it's a bare literal string
1371 pw_set_status(mw_parser_error(parser, parser->current_indent, "Not a key"));
1372 return false;
1373 }
1374 return parse_literal_string(parser, result);
1375}
1376
1377[[nodiscard]] static bool value_parser_func(MwParser* parser, PwValuePtr result)
1378{
1379 return parse_value(parser, nullptr, nullptr, result);
1380}
1381
1382[[nodiscard]] bool mw_parse(PwValuePtr markup, PwValuePtr result)
1383{
1384 [[ gnu::cleanup(mw_delete_parser) ]] MwParser* parser = mw_create_parser(markup);
1385 if (!parser) {
1386 return false;
1387 }
1388 // read first line to prepare for parsing and to detect EOF
1389 if (!_mw_read_block_line(parser)) {
1390 if (_mw_end_of_block() && parser->eof) {
1391 pw_set_status(PwStatus(PW_ERROR_EOF));
1392 }
1393 return false;
1394 }
1395
1396 // parse top-level value
1397 if (!value_parser_func(parser, result)) {
1398 return false;
1399 }
1400
1401 // make sure markup has no more data
1402 if (_mw_read_block_line(parser)) {
1403 pw_set_status(mw_parser_error(parser, parser->current_indent, "Extra data after parsed value"));
1404 return false;
1405 }
1406 return parser->eof;
1407}