toolkit to make CSV into something that texttools can easily operate on
authormirabilos <t.glaser@tarent.de>
Thu, 26 Nov 2015 15:07:31 +0000 (16:07 +0100)
committermirabilos <t.glaser@tarent.de>
Thu, 26 Nov 2015 15:07:31 +0000 (16:07 +0100)
mksh/ssv/.gitignore [new file with mode: 0644]
mksh/ssv/0-README [new file with mode: 0644]
mksh/ssv/csv2ssv.c [new file with mode: 0644]
mksh/ssv/ssv2csv [new file with mode: 0644]

diff --git a/mksh/ssv/.gitignore b/mksh/ssv/.gitignore
new file mode 100644 (file)
index 0000000..c789713
--- /dev/null
@@ -0,0 +1,2 @@
+csv2ssv
+csv2ssv.o
diff --git a/mksh/ssv/0-README b/mksh/ssv/0-README
new file mode 100644 (file)
index 0000000..6cd68d2
--- /dev/null
@@ -0,0 +1,28 @@
+SSV: shell-parseable separated values, or separator-separated values
+
+Basic idea is to make CSV into something usable:
+• newline (\x0A) is row separator
+• field separator (\x1C) is column separator
+• n̲o̲ quotes or escape characters
+• carriage return (\x0D) represents embedded newlines in cells
+
+Cell content is arbitrary binary except \x0A and \x1C (and NUL).
+
+
+SSV can easily be operated on from within shell, e.g. with:
+
+       while IFS=$'\x1C' read -A line; do
+               for x in ${!line[*]}; do
+                       print -r -- "$x=${line[x]@Q}"
+               done
+               print
+       done
+
+This sample script reads in SSV line-by-line and outputs
+each cell on a new line, with their index and quoted content.
+
+
+Apologies for the C file, but it’s much more efficient to
+do the conversion f̲r̲o̲m̲ CSV with mmap. It could be done in
+mksh of course… and I’d probably have cursed about C less
+had I used it… but now it’s there and works.
diff --git a/mksh/ssv/csv2ssv.c b/mksh/ssv/csv2ssv.c
new file mode 100644 (file)
index 0000000..8543f2d
--- /dev/null
@@ -0,0 +1,166 @@
+/*-
+ * Copyright (c) 2015
+ *     mirabilos <mirabilos@evolvis.org>
+ *
+ * Provided that these terms and disclaimer and all copyright notices
+ * are retained or reproduced in an accompanying document, permission
+ * is granted to deal in this work without restriction, including un-
+ * limited rights to use, publicly perform, distribute, sell, modify,
+ * merge, give away, or sublicence.
+ *
+ * This work is provided "AS IS" and WITHOUT WARRANTY of any kind, to
+ * the utmost extent permitted by applicable law, neither express nor
+ * implied; without malicious intent or gross negligence. In no event
+ * may a licensor, author or contributor be held liable for indirect,
+ * direct, other damage, loss, or other issues arising in any way out
+ * of dealing in the work, even if advised of the possibility of such
+ * damage or existence of a defect, except proven that it results out
+ * of said person's immediate fault when using the work as intended.
+ */
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifndef O_BINARY
+#define O_BINARY       0
+#endif
+
+#ifndef SIZE_MAX
+#ifdef SIZE_T_MAX
+#define SIZE_MAX       SIZE_T_MAX
+#else
+#define SIZE_MAX       ((size_t)-1)
+#endif
+#endif
+
+static __attribute__((__noreturn__)) void
+usage(int rv)
+{
+       fprintf(stderr, "Usage: csv2ssv [-q quotechar] [-s separator] infile\n"
+           "    Default separator is HT (Tab), quotechar is none.\n"
+           "Example: csv2ssv -q \\\" -s \\; foo.csv >foo.ssv\n");
+       exit(rv);
+}
+
+int
+main(int argc, char *argv[])
+{
+       unsigned char c;
+       const unsigned char *cp;
+       int i;
+       unsigned char cq = 0, cs = '\t';
+       const unsigned char *bp, *ep;
+       unsigned char *mp;
+       struct stat sb;
+
+       while ((i = getopt(argc, argv, "hq:s:")) != -1)
+               switch (i) {
+               case 'h':
+                       usage(0);
+               case 'q':
+                       cq = (unsigned char)optarg[0];
+                       break;
+               case 's':
+                       cs = (unsigned char)optarg[0];
+                       break;
+               default:
+                       usage(1);
+               }
+       argc -= optind;
+       argv += optind;
+
+       if (argc != 1)
+               usage(1);
+
+       if ((i = open(argv[0], O_RDONLY | O_BINARY)) == -1)
+               err(1, "%s %s", "open", argv[0]);
+       if (fstat(i, &sb))
+               err(1, "%s %s", "stat", argv[0]);
+       if (sb.st_size < 1)
+               errx(1, "%s %s", "file too small", argv[0]);
+       if ((sizeof(off_t) > sizeof(size_t)) && (sb.st_size > (off_t)SIZE_MAX))
+               errx(1, "%s %s", "file too large", argv[0]);
+       if ((mp = (unsigned char *)mmap(NULL, (size_t)sb.st_size, PROT_READ,
+           MAP_PRIVATE, i, 0)) == MAP_FAILED)
+               err(1, "%s %s", "mmap", argv[0]);
+       cp = bp = mp;
+       ep = mp + ((size_t)sb.st_size - 1);
+
+       setlinebuf(stdout);
+
+       while (cp <= ep) {
+               switch ((c = *cp++)) {
+               case 0x00:
+               case 0x1C:
+                       errx(1, "\\x%02X found at offset %zu",
+                           (unsigned int)c, (size_t)(cp - mp) - 1);
+               case 0x0A:
+                       if (cp > ep)
+                               goto nl_out;
+                       continue;
+               default:
+                       if (c == cq || c == cs)
+                               break;
+                       continue;
+               }
+
+               if ((size_t)(cp - bp) > 1)
+                       fwrite(bp, (size_t)(cp - bp) - 1, 1, stdout);
+
+               bp = cp;
+               if (c == cs) {
+                       fputc(0x1C, stdout);
+                       continue;
+               }
+
+               /* c == cq */
+               while (cp <= ep) {
+                       if (!(c = *cp++) || c == 0x1C) {
+                               errx(1, "\\x%02X found at offset %zu",
+                                   (unsigned int)c, (size_t)(cp - mp) - 1);
+                       } else if (c == cq) {
+                               /* next a quote? */
+                               if ((cp <= ep) && (*cp == cq)) {
+                                       /* yes, un-escape */
+                                       ++cp;
+                               } else
+                                       goto quote_out;
+                       } else if (c == 0x0D) {
+                               /* next a newline? */
+                               if ((cp <= ep) && (*cp == 0x0A)) {
+                                       /* yes, skip it */
+                                       ++cp;
+                               }
+                       } else if (c == 0x0A) {
+                               /* encode newline as CR */
+                               c = 0x0D;
+                       }
+                       fputc(c, stdout);
+               }
+               errx(1, "unexpected EOF within quote starting at offset %zu",
+                   (size_t)(bp - mp) - 1);
+
+ quote_out:
+               bp = cp;
+       }
+       errx(1, "unexpected EOF (newline expected)");
+
+ nl_out:
+       fwrite(bp, (size_t)(cp - bp), 1, stdout);
+       fflush(stdout);
+
+       if (munmap(mp, (size_t)sb.st_size))
+               err(2, "%s %s", "munmap", argv[0]);
+       if (close(i))
+               err(2, "%s %s", "close", argv[0]);
+
+       return (0);
+}
diff --git a/mksh/ssv/ssv2csv b/mksh/ssv/ssv2csv
new file mode 100644 (file)
index 0000000..bc28d6d
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/mksh
+#-
+# Copyright © 2015
+#      mirabilos <mirabilos@evolvis.org>
+#
+# Provided that these terms and disclaimer and all copyright notices
+# are retained or reproduced in an accompanying document, permission
+# is granted to deal in this work without restriction, including un‐
+# limited rights to use, publicly perform, distribute, sell, modify,
+# merge, give away, or sublicence.
+#
+# This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to
+# the utmost extent permitted by applicable law, neither express nor
+# implied; without malicious intent or gross negligence. In no event
+# may a licensor, author or contributor be held liable for indirect,
+# direct, other damage, loss, or other issues arising in any way out
+# of dealing in the work, even if advised of the possibility of such
+# damage or existence of a defect, except proven that it results out
+# of said person’s immediate fault when using the work as intended.
+#-
+# Filter (pipe) to convert SSV into tab-separated, double-quoted CSV
+
+exec sed \
+    -e 's/"/""/g' \
+    -e 's/^/"/' \
+    -e 's/$/"/' \
+    -e $'s/\x1C/"\t"/g'