toolkit to make CSV into something that texttools can easily operate on
[shellsnippets/shellsnippets.git] / mksh / ssv / csv2ssv.c
1 /*-
2  * Copyright (c) 2015
3  *      mirabilos <mirabilos@evolvis.org>
4  *
5  * Provided that these terms and disclaimer and all copyright notices
6  * are retained or reproduced in an accompanying document, permission
7  * is granted to deal in this work without restriction, including un-
8  * limited rights to use, publicly perform, distribute, sell, modify,
9  * merge, give away, or sublicence.
10  *
11  * This work is provided "AS IS" and WITHOUT WARRANTY of any kind, to
12  * the utmost extent permitted by applicable law, neither express nor
13  * implied; without malicious intent or gross negligence. In no event
14  * may a licensor, author or contributor be held liable for indirect,
15  * direct, other damage, loss, or other issues arising in any way out
16  * of dealing in the work, even if advised of the possibility of such
17  * damage or existence of a defect, except proven that it results out
18  * of said person's immediate fault when using the work as intended.
19  */
20
21 #define _GNU_SOURCE
22 #include <sys/types.h>
23 #include <sys/mman.h>
24 #include <sys/stat.h>
25 #include <err.h>
26 #include <fcntl.h>
27 #include <limits.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <unistd.h>
31
32 #ifndef O_BINARY
33 #define O_BINARY        0
34 #endif
35
36 #ifndef SIZE_MAX
37 #ifdef SIZE_T_MAX
38 #define SIZE_MAX        SIZE_T_MAX
39 #else
40 #define SIZE_MAX        ((size_t)-1)
41 #endif
42 #endif
43
/*
 * Write the command-line synopsis to standard error, then terminate
 * the process with exit status rv. This function does not return.
 */
static __attribute__((__noreturn__)) void
usage(int rv)
{
        /* contains no conversion specifiers, so it can be emitted verbatim */
        static const char helptext[] =
            "Usage: csv2ssv [-q quotechar] [-s separator] infile\n"
            "    Default separator is HT (Tab), quotechar is none.\n"
            "Example: csv2ssv -q \\\" -s \\; foo.csv >foo.ssv\n";

        fputs(helptext, stderr);
        exit(rv);
}
52
53 int
54 main(int argc, char *argv[])
55 {
56         unsigned char c;
57         const unsigned char *cp;
58         int i;
59         unsigned char cq = 0, cs = '\t';
60         const unsigned char *bp, *ep;
61         unsigned char *mp;
62         struct stat sb;
63
64         while ((i = getopt(argc, argv, "hq:s:")) != -1)
65                 switch (i) {
66                 case 'h':
67                         usage(0);
68                 case 'q':
69                         cq = (unsigned char)optarg[0];
70                         break;
71                 case 's':
72                         cs = (unsigned char)optarg[0];
73                         break;
74                 default:
75                         usage(1);
76                 }
77         argc -= optind;
78         argv += optind;
79
80         if (argc != 1)
81                 usage(1);
82
83         if ((i = open(argv[0], O_RDONLY | O_BINARY)) == -1)
84                 err(1, "%s %s", "open", argv[0]);
85         if (fstat(i, &sb))
86                 err(1, "%s %s", "stat", argv[0]);
87         if (sb.st_size < 1)
88                 errx(1, "%s %s", "file too small", argv[0]);
89         if ((sizeof(off_t) > sizeof(size_t)) && (sb.st_size > (off_t)SIZE_MAX))
90                 errx(1, "%s %s", "file too large", argv[0]);
91         if ((mp = (unsigned char *)mmap(NULL, (size_t)sb.st_size, PROT_READ,
92             MAP_PRIVATE, i, 0)) == MAP_FAILED)
93                 err(1, "%s %s", "mmap", argv[0]);
94         cp = bp = mp;
95         ep = mp + ((size_t)sb.st_size - 1);
96
97         setlinebuf(stdout);
98
99         while (cp <= ep) {
100                 switch ((c = *cp++)) {
101                 case 0x00:
102                 case 0x1C:
103                         errx(1, "\\x%02X found at offset %zu",
104                             (unsigned int)c, (size_t)(cp - mp) - 1);
105                 case 0x0A:
106                         if (cp > ep)
107                                 goto nl_out;
108                         continue;
109                 default:
110                         if (c == cq || c == cs)
111                                 break;
112                         continue;
113                 }
114
115                 if ((size_t)(cp - bp) > 1)
116                         fwrite(bp, (size_t)(cp - bp) - 1, 1, stdout);
117
118                 bp = cp;
119                 if (c == cs) {
120                         fputc(0x1C, stdout);
121                         continue;
122                 }
123
124                 /* c == cq */
125                 while (cp <= ep) {
126                         if (!(c = *cp++) || c == 0x1C) {
127                                 errx(1, "\\x%02X found at offset %zu",
128                                     (unsigned int)c, (size_t)(cp - mp) - 1);
129                         } else if (c == cq) {
130                                 /* next a quote? */
131                                 if ((cp <= ep) && (*cp == cq)) {
132                                         /* yes, un-escape */
133                                         ++cp;
134                                 } else
135                                         goto quote_out;
136                         } else if (c == 0x0D) {
137                                 /* next a newline? */
138                                 if ((cp <= ep) && (*cp == 0x0A)) {
139                                         /* yes, skip it */
140                                         ++cp;
141                                 }
142                         } else if (c == 0x0A) {
143                                 /* encode newline as CR */
144                                 c = 0x0D;
145                         }
146                         fputc(c, stdout);
147                 }
148                 errx(1, "unexpected EOF within quote starting at offset %zu",
149                     (size_t)(bp - mp) - 1);
150
151  quote_out:
152                 bp = cp;
153         }
154         errx(1, "unexpected EOF (newline expected)");
155
156  nl_out:
157         fwrite(bp, (size_t)(cp - bp), 1, stdout);
158         fflush(stdout);
159
160         if (munmap(mp, (size_t)sb.st_size))
161                 err(2, "%s %s", "munmap", argv[0]);
162         if (close(i))
163                 err(2, "%s %s", "close", argv[0]);
164
165         return (0);
166 }