add a testsuite, align CR handling with Java™ code
[shellsnippets/shellsnippets.git] / mksh / ssv / csv2ssv.c
1 /*-
2  * Copyright (c) 2015, 2017
3  *      mirabilos <mirabilos@evolvis.org>
4  *
5  * Provided that these terms and disclaimer and all copyright notices
6  * are retained or reproduced in an accompanying document, permission
7  * is granted to deal in this work without restriction, including un-
8  * limited rights to use, publicly perform, distribute, sell, modify,
9  * merge, give away, or sublicence.
10  *
11  * This work is provided "AS IS" and WITHOUT WARRANTY of any kind, to
12  * the utmost extent permitted by applicable law, neither express nor
13  * implied; without malicious intent or gross negligence. In no event
14  * may a licensor, author or contributor be held liable for indirect,
15  * direct, other damage, loss, or other issues arising in any way out
16  * of dealing in the work, even if advised of the possibility of such
17  * damage or existence of a defect, except proven that it results out
18  * of said person's immediate fault when using the work as intended.
19  */
20
21 #define _GNU_SOURCE
22 #include <sys/types.h>
23 #include <sys/mman.h>
24 #include <sys/stat.h>
25 #include <err.h>
26 #include <fcntl.h>
27 #include <limits.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <unistd.h>
31
32 #ifndef O_BINARY
33 #define O_BINARY        0
34 #endif
35
36 #ifndef SIZE_MAX
37 #ifdef SIZE_T_MAX
38 #define SIZE_MAX        SIZE_T_MAX
39 #else
40 #define SIZE_MAX        ((size_t)-1)
41 #endif
42 #endif
43
/*
 * Write the command-line synopsis to standard error, then terminate
 * the process with exit status rv. This function does not return.
 */
static __attribute__((__noreturn__)) void
usage(int rv)
{
        static const char helptext[] =
            "Usage: csv2ssv [-q quotechar] [-s separator] infile\n"
            "    Default separator is HT (Tab), quotechar is none.\n"
            "Example: csv2ssv -q \\\" -s \\; foo.csv >foo.ssv\n";

        fputs(helptext, stderr);
        exit(rv);
}
52
53 int
54 main(int argc, char *argv[])
55 {
56         unsigned char c;
57         const unsigned char *cp;
58         int i;
59         unsigned char cq = 0, cs = '\t';
60         const unsigned char *bp, *ep;
61         unsigned char *mp;
62         struct stat sb;
63
64         while ((i = getopt(argc, argv, "hq:s:")) != -1)
65                 switch (i) {
66                 case 'h':
67                         usage(0);
68                 case 'q':
69                         cq = (unsigned char)optarg[0];
70                         break;
71                 case 's':
72                         cs = (unsigned char)optarg[0];
73                         break;
74                 default:
75                         usage(1);
76                 }
77         argc -= optind;
78         argv += optind;
79
80         if (argc != 1)
81                 usage(1);
82
83         if ((i = open(argv[0], O_RDONLY | O_BINARY)) == -1)
84                 err(1, "%s %s", "open", argv[0]);
85         if (fstat(i, &sb))
86                 err(1, "%s %s", "stat", argv[0]);
87         if (sb.st_size < 1)
88                 errx(1, "%s %s", "file too small", argv[0]);
89         if ((sizeof(off_t) > sizeof(size_t)) && (sb.st_size > (off_t)SIZE_MAX))
90                 errx(1, "%s %s", "file too large", argv[0]);
91         if ((mp = (unsigned char *)mmap(NULL, (size_t)sb.st_size, PROT_READ,
92             MAP_PRIVATE, i, 0)) == MAP_FAILED)
93                 err(1, "%s %s", "mmap", argv[0]);
94         cp = bp = mp;
95         ep = mp + ((size_t)sb.st_size - 1);
96
97         setlinebuf(stdout);
98
99         while (cp <= ep) {
100  normal:
101                 switch ((c = *cp++)) {
102                 case 0x00:
103                 case 0x1C:
104                         errx(1, "\\x%02X found at offset %zu",
105                             (unsigned int)c, (size_t)(cp - mp) - 1);
106                 case 0x0D:
107                         break;
108                 case 0x0A:
109                         if (cp > ep)
110                                 goto nl_out;
111                         continue;
112                 default:
113                         if (c == cq || c == cs)
114                                 break;
115                         continue;
116                 }
117
118                 if ((size_t)(cp - bp) > 1)
119                         fwrite(bp, (size_t)(cp - bp) - 1, 1, stdout);
120
121                 bp = cp;
122                 if (c == cs) {
123                         fputc(0x1C, stdout);
124                         continue;
125                 }
126
127                 if (c != cq) {
128                         /* 0x0D */
129                         while (cp <= ep) {
130                                 switch ((c = *cp++)) {
131                                 case 0x0A:
132                                         bp = cp;
133                                         /* FALLTHROUGH */
134                                 default:
135                                         --cp;
136                                         --bp;
137                                         goto normal;
138                                 case 0x0D:
139                                         break;
140                                 }
141                         }
142                         continue;
143                 }
144
145                 /* c == cq */
146                 while (cp <= ep) {
147                         if (!(c = *cp++) || c == 0x1C) {
148                                 errx(1, "\\x%02X found at offset %zu",
149                                     (unsigned int)c, (size_t)(cp - mp) - 1);
150                         } else if (c == cq) {
151                                 /* next a quote? */
152                                 if ((cp <= ep) && (*cp == cq)) {
153                                         /* yes, un-escape */
154                                         ++cp;
155                                 } else
156                                         goto quote_out;
157                         } else if (c == 0x0D) {
158                                 /* next a newline? */
159                                 if ((cp <= ep) && (*cp == 0x0A)) {
160                                         /* yes, skip it */
161                                         ++cp;
162                                 }
163                         } else if (c == 0x0A) {
164                                 /* encode newline as CR */
165                                 c = 0x0D;
166                         }
167                         fputc(c, stdout);
168                 }
169                 errx(1, "unexpected EOF within quote starting at offset %zu",
170                     (size_t)(bp - mp) - 1);
171
172  quote_out:
173                 bp = cp;
174         }
175         errx(1, "unexpected EOF (newline expected)");
176
177  nl_out:
178         fwrite(bp, (size_t)(cp - bp), 1, stdout);
179         fflush(stdout);
180
181         if (munmap(mp, (size_t)sb.st_size))
182                 err(2, "%s %s", "munmap", argv[0]);
183         if (close(i))
184                 err(2, "%s %s", "close", argv[0]);
185
186         return (0);
187 }