iopipe.textpipe source code

1 /**
2  Text handling with iopipe.
3 Copyright: Copyright Steven Schveighoffer 2011-.
4 License:   Boost License 1.0. (See accompanying file LICENSE_1_0.txt or copy
5            at http://www.boost.org/LICENSE_1_0.txt)
6 Authors:   Steven Schveighoffer
7  */
8 module iopipe.textpipe;
9 import iopipe.bufpipe;
10 import iopipe.traits;
11 import std.range: isRandomAccessRange, hasLength, ElementType, ElementEncodingType;
12 import std.traits: Unqual, isSomeChar, isDynamicArray, isIntegral;
13 
14 /**
15  * Used to specify stream type
16  */
17 enum UTFType
18 {
    /// No encoding identified. `detectBOM` returns this when no byte order
    /// mark matches; `CodeUnit` and `assumeText` treat it like UTF-8.
19     Unknown,
    /// UTF-8 (code unit `char`). Reported by `detectBOM` for bytes EF BB BF.
20     UTF8,
    /// UTF-16, little endian (code unit `wchar`). Reported for bytes FF FE
    /// when they are not followed by two zero bytes.
21     UTF16LE,
    /// UTF-16, big endian (code unit `wchar`). Reported for bytes FE FF.
22     UTF16BE,
    /// UTF-32, little endian (code unit `dchar`). Reported for bytes FF FE 00 00.
23     UTF32LE,
    /// UTF-32, big endian (code unit `dchar`). Reported for bytes 00 00 FE FF.
24     UTF32BE
25 }
26 
/**
 * Maps a `UTFType` to the code unit type used to hold text in that encoding.
 *
 * Both UTF-32 byte orders yield `dchar`, both UTF-16 byte orders yield
 * `wchar`, and `UTF8` yields `char`. `Unknown` also yields `char`, since
 * UTF-8 is the assumed default.
 */
template CodeUnit(UTFType u)
{
    static if(u == UTFType.UTF32LE || u == UTFType.UTF32BE)
        alias CodeUnit = dchar;
    else static if(u == UTFType.UTF16LE || u == UTFType.UTF16BE)
        alias CodeUnit = wchar;
    else static if(u == UTFType.UTF8 || u == UTFType.Unknown)
        alias CodeUnit = char;
    else
        static assert(0);
}
43 
/**
 * Inspect the start of a random access range of bytes for a byte order mark
 * and report which UTF encoding it implies. The range is taken by value and
 * is not advanced past the BOM.
 *
 * Params:
 *    r = Range in which to detect the BOM. Must be a finite random access
 *        range whose elements convert to `ubyte`.
 *
 * Returns:
 *    The `UTFType` implied by the BOM, or `UTFType.Unknown` when the range is
 *    shorter than two elements or no BOM pattern matches.
 */
UTFType detectBOM(R)(R r) if (isRandomAccessRange!R && hasLength!R && is(ElementType!R : const(ubyte)))
{
    immutable n = r.length;
    // every BOM is at least two bytes long
    if(n < 2)
        return UTFType.Unknown;

    // each BOM starts with a distinct first byte, so dispatch on it
    const ubyte first = r[0];
    switch(first)
    {
    case 0xFE:
        if(r[1] == 0xFF)
            return UTFType.UTF16BE;
        break;
    case 0xFF:
        if(r[1] == 0xFE)
        {
            // FF FE followed by two zero bytes is most likely UTF32,
            // otherwise it is the UTF16 little endian mark.
            immutable looksLikeUTF32 = n >= 4 && r[2] == 0 && r[3] == 0;
            return looksLikeUTF32 ? UTFType.UTF32LE : UTFType.UTF16LE;
        }
        break;
    case 0xEF:
        if(n >= 3 && r[1] == 0xBB && r[2] == 0xBF)
            return UTFType.UTF8;
        break;
    case 0x00:
        if(n >= 4 && r[1] == 0 && r[2] == 0xFE && r[3] == 0xFF)
            return UTFType.UTF32BE;
        break;
    default:
        break;
    }
    return UTFType.Unknown;
}
78 
@safe unittest
{
    // One buffer holding overlapping BOM patterns; each slice below selects
    // the bytes for a single detection case.
    ubyte[] bytes = [0xFE, 0XFF, 0xFE, 0, 0, 0xFE, 0xFF, 0xEF, 0xBB, 0xBF];

    assert(detectBOM(bytes) == UTFType.UTF16BE);
    assert(detectBOM(bytes[1 .. 4]) == UTFType.UTF16LE);
    assert(detectBOM(bytes[1 .. 5]) == UTFType.UTF32LE);
    assert(detectBOM(bytes[3 .. $]) == UTFType.UTF32BE);
    assert(detectBOM(bytes[7 .. $]) == UTFType.UTF8);
    assert(detectBOM(bytes[4 .. $]) == UTFType.Unknown);
}
92 
/**
 * Iopipe wrapper that withholds a trailing, incomplete UTF sequence from its
 * window, so that everything visible downstream ends on a code point
 * boundary.
 *
 * `partial` is the number of code units at the end of the upstream window
 * that are currently hidden because they start a sequence whose remaining
 * code units have not arrived yet.
 */
struct DecodeableWindow(Chain, CodeUnitType)
{
    Chain chain;
    ubyte partial;

    /// The upstream window without the trailing `partial` code units.
    auto window() { return chain.window[0 .. $-partial]; }

    /// Release `elements` code units from the front of the upstream pipe.
    void release(size_t elements) { chain.release(elements); }

    /**
     * Recompute `partial` from the current end of the upstream window.
     *
     * Invalid data is never withheld (`partial` becomes 0) so that whatever
     * decodes the window downstream is the one to report the error.
     */
    private void determinePartial()
    {
        static if(is(CodeUnitType == char))
        {
            import core.bitop : bsr;
            auto w = chain.window;

            // Start from "nothing withheld". This also guarantees a value
            // left over from an earlier call can never survive this one.
            partial = 0;

            // Walk backwards from the end looking for the lead byte of the
            // final sequence. A UTF-8 sequence is at most 4 code units, so
            // the lead byte is at most 4 positions from the end.
            foreach(ubyte i; 1 .. 5)
            {
                if(w.length < i)
                {
                    // either no data, or only continuation bytes (invalid).
                    // TODO: throw some error?
                    return;
                }

                immutable inverted = ~cast(uint)w[$ - i] & 0x0ff;
                if(inverted == 0)
                {
                    // 0xFF never appears in UTF-8, and bsr(0) is undefined.
                    // TODO: throw some error?
                    return;
                }

                // position of the highest clear bit of the byte
                immutable highestBit = bsr(inverted);
                if(highestBit == 6)
                    // continuation byte, need to continue looking
                    continue;

                if(highestBit >= 3 && highestBit <= 5)
                {
                    // lead byte of a multi-byte sequence:
                    // 5 -> 2 byte sequence
                    // 4 -> 3 byte sequence
                    // 3 -> 4 byte sequence
                    // i.e. the expected length is 7 - highestBit, and i is
                    // the number of code units of it that are present.
                    if(i + highestBit < 7)
                        // the whole sequence isn't there yet, hide it.
                        partial = i;
                    // otherwise the sequence is complete, or has too many
                    // continuation bytes (invalid): let it pass.
                }
                // highestBit == 7 is an ascii character; anything else is
                // an invalid lead byte. Either way nothing is withheld.
                return;
            }
            // four continuation bytes in a row: invalid, let it fail.
        }
        else // wchar
        {
            // if the last character is in 0xD800 - 0xDBFF, then it is
            // the first wchar of a surrogate pair. This means we must
            // leave it off the end.
            partial = chain.window.length > 0 && (chain.window[$-1] & 0xFC00) == 0xD800 ? 1 : 0;
        }
    }

    /**
     * Extend the upstream pipe and recompute the withheld tail.
     *
     * Returns: the number of code units by which the visible window grew.
     */
    size_t extend(size_t elements)
    {
        auto origWindowSize = window.length;
        cast(void)chain.extend(elements > partial ? elements - partial : elements);
        determinePartial();
        // TODO: may need to loop if we are getting one char at a time.
        return window.length - origWindowSize;
    }

    mixin implementValve!chain;
}
172 
/**
 * Wraps a text-based iopipe to make sure all code units are decodeable.
 *
 * A slice of a character iopipe's window is not always completely
 * decodeable. For example, a wchar iopipe may have only the first half of a
 * surrogate pair at the end of its window.
 *
 * The iopipe produced here only exposes completely decodeable sequences to
 * the next iopipe in the chain.
 *
 * Params:
 *    c = The iopipe whose element type is one of char, wchar, or dchar.
 *
 * Returns:
 *    An appropriate iopipe that ensures decodeability. dchar iopipes are
 *    always decodeable, and a pipe that is already the result of
 *    `ensureDecodeable` needs no second wrapper; in both cases the input is
 *    returned unchanged.
 */
auto ensureDecodeable(Chain)(Chain c) if (isIopipe!Chain && isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    import std.traits: Unqual;
    alias CodeUnitType = Unqual!(ElementEncodingType!(WindowType!Chain));

    static if(is(CodeUnitType == dchar))
    {
        // always decodeable
        return c;
    }
    else
    {
        // Avoid stacking wrappers. It is hard to test directly whether Chain
        // is a DecodeableWindow, so instead pretend to wrap c's upstream
        // chain: if that yields exactly the type we were given, c must
        // already be a DecodeableWindow.
        static if(__traits(hasMember, Chain, "chain"))
            enum alreadyWrapped = is(typeof(.ensureDecodeable(c.chain)) == Chain);
        else
            enum alreadyWrapped = false;

        static if(alreadyWrapped)
        {
            return c;
        }
        else
        {
            auto wrapped = DecodeableWindow!(Chain, CodeUnitType)(c);
            // the initial window may already end in an incomplete sequence
            wrapped.determinePartial();
            return wrapped;
        }
    }
}
217 
@safe unittest
{
    // wrapping an already wrapped pipe must not add a second layer
    auto text = "hello";
    auto once = text.ensureDecodeable;
    auto twice = once.ensureDecodeable;
    static assert(is(typeof(twice) == typeof(once)));
}
226 
/**
 * Reinterpret an iopipe of raw integral data as UTF text in a known encoding.
 *
 * The window of the given iopipe must be a dynamic array of an integral
 * type. The result has a window of `char`, `wchar`, or `dchar` elements
 * according to the encoding. This is useful for data from a raw source (such
 * as a file or stream) that you have determined, or know, to be UTF data.
 *
 * If the data must be byte-swapped, then it must be mutable. Otherwise,
 * immutable or const data is allowed.
 *
 * Params:
 *    enc = The assumed encoding of the text pipe. `Unknown` is handled the
 *          same way as `UTF8`.
 *    c = The chain to assume the encoding for. This MUST have a dynamic
 *            array type for its window, and the elements must be integral.
 * Returns:
 *    An appropriate iopipe that has a window of the appropriate character type
 *    (`char`, `wchar`, or `dchar`) for the assumed encoding. The window will
 *    be set up so its elements are properly byte-ordered for the compiled
 *    platform.
 */
auto assumeText(UTFType enc = UTFType.UTF8, Chain)(Chain c) if (isIopipe!Chain && isDynamicArray!(WindowType!Chain) && isIntegral!(ElementEncodingType!(WindowType!Chain)))
{
    static if(enc == UTFType.UTF8 || enc == UTFType.Unknown)
    {
        // single byte code units: no byte order to worry about
        return c.arrayCastPipe!char;
    }
    else
    {
        enum isLittleEndian = enc == UTFType.UTF16LE || enc == UTFType.UTF32LE;
        enum isBigEndian = enc == UTFType.UTF16BE || enc == UTFType.UTF32BE;
        static assert(isLittleEndian || isBigEndian);

        // cast to the multi-byte code unit, then fix up the byte order
        return c.arrayCastPipe!(CodeUnit!enc).byteSwapper!isLittleEndian;
    }
}
263 
@safe unittest
{
    import std.algorithm : equal;
    import core.bitop : bswap;

    // typical case: UTF-8 text that arrives as raw ubytes
    ubyte[] utf8Bytes = ['h', 'e', 'l', 'l', 'o'];

    auto utf8Pipe = utf8Bytes.assumeText!(UTFType.UTF8);
    static assert(is(WindowType!(typeof(utf8Pipe)) == char[]));
    assert("hello".equal(utf8Pipe.window));

    // "hello" as 32-bit units, stored in the opposite byte order
    uint[] swappedUnits = ['h', 'e', 'l', 'l', 'o'];
    foreach(ref unit; swappedUnits)
        unit = bswap(unit);

    // pick the UTF-32 flavor that is NOT native to this platform
    version(BigEndian)
        enum foreignEnc = UTFType.UTF32LE;
    else
        enum foreignEnc = UTFType.UTF32BE;

    auto utf32Pipe = swappedUnits.assumeText!foreignEnc;
    static assert(is(WindowType!(typeof(utf32Pipe)) == dchar[]));
    assert("hello".equal(utf32Pipe.window));
}
294 
/**
 * Iopipe that exposes its upstream text one delimited segment at a time.
 *
 * The window is the prefix of the upstream window that has already been
 * scanned (`checked` elements). Each call to `extend` scans forward until the
 * next occurrence of the encoded delimiter, or until upstream has no more
 * data, and grows the window to that point.
 */
private struct DelimitedTextPipe(Chain)
{
    alias CodeUnitType = Unqual!(typeof(Chain.init.window[0]));
    private
    {
        Chain chain;
        // number of upstream window elements exposed through window()
        size_t checked;
        // number of extend calls that made the window grow
        size_t _segments;
        // true when the last extend stopped right after a delimiter
        bool endsWithDelim;
        // the delimiter, encoded in the code unit type of the chain
        CodeUnitType[dchar.sizeof / CodeUnitType.sizeof] delimElems;

        static if(is(CodeUnitType == dchar))
        {
            enum validDelimElems = 1;
            enum skippableElems = 1;
        }
        else
        {
            // number of elements in delimElems that are valid
            ubyte validDelimElems;
            // number of elements that can be skipped if the sequence fails
            // to match. This basically is the number of elements that are
            // not the first element (except the first element of course).
            ubyte skippableElems;
        }
    }

    /// The segments scanned so far and not yet released.
    auto window() { return chain.window[0 .. checked]; }

    /// Number of delimiter code units at the end of the window (0 if the
    /// last extend did not end on a delimiter).
    ubyte delimTrailer() { return endsWithDelim ? validDelimElems : 0; }

    /// Release elements from the front of the window and of the upstream pipe.
    void release(size_t elements)
    {
        checked -= elements;
        chain.release(elements);
    }

    /**
     * Grow the window by one more delimited segment.
     *
     * Params:
     *    elements = passed through to the upstream `extend` whenever more
     *               data is needed.
     * Returns:
     *    The number of elements added to the window; 0 when nothing is left.
     */
    size_t extend(size_t elements = 0)
    {
        auto newChecked = checked;
        endsWithDelim = false;
        const ve = validDelimElems;
        {
            // scan for first delimiter element
byline_outer_1:
            do
            {
                // make sure we don't even get into a check if we don't have enough elements to check.
                auto w = chain.window;
                if(newChecked + ve > w.length)
                    continue;
                immutable t = delimElems[0];
                static if(isDynamicArray!(WindowType!(Chain)))
                {
                    if(__ctfe)
                    {
                        // don't use pointer tricks or memchr
ctfe_while:
                        while(newChecked + ve <= w.length)
                        {
                            if(w[newChecked++] == t)
                            {
                                // found first element, look for the others
                                foreach(i; 1 .. ve)
                                    if(w[newChecked] != delimElems[i])
                                        continue ctfe_while;
                                    else
                                        ++newChecked;
                                endsWithDelim = true;
                                break byline_outer_1;
                            }
                        }
                        continue;
                    }

                    // search for the first delimiter element using @system methods
                    bool search() @trusted
                    {
                        auto p = w.ptr + newChecked;
                        static if(CodeUnitType.sizeof == 1)
                        {
                            // can use memchr
                            import core.stdc.string: memchr;
                            // should be true because the caller only searches
                            // while a full delimiter still fits in the window.
                            assert(newChecked + (ve - 1) <= w.length);
                            auto delimp = cast(typeof(p))memchr(p, t, w.length - newChecked - (ve - 1));
                            if(delimp != null)
                            {
                                // found it
                                newChecked = delimp + 1 - w.ptr;
                                return true;
                            }
                        }
                        else
                        {
                            auto e = w.ptr + w.length - (ve - 1);
                            while(p < e)
                            {
                                if(*p++ == t)
                                {
                                    // found it
                                    newChecked = p - w.ptr;
                                    return true;
                                }
                            }
                        }
                        return false;
                    }

                    // Keep searching the data that is already buffered for as
                    // long as a complete delimiter can still fit. A first
                    // element whose followers do not match must NOT send us to
                    // upstream for more data: the real delimiter may be later
                    // in this same window.
                    while(newChecked + ve <= w.length)
                    {
                        if(!search())
                        {
                            // nothing in this window. Leave the last ve - 1
                            // elements unchecked, they could be the start of
                            // a delimiter completed by the next extend.
                            newChecked = w.length - (ve - 1);
                            break;
                        }

                        // found the first delimiter element. If multiple
                        // exist, we need to check those as well.
                        // TODO: should we optimize this?
                        size_t i = 1;
                        while(i < ve && w[newChecked] == delimElems[i])
                        {
                            ++newChecked;
                            ++i;
                        }
                        if(i == ve)
                        {
                            endsWithDelim = true;
                            break byline_outer_1;
                        }
                        // false start, resume the search after the elements
                        // that did match.
                    }
                }
                else
                {
generic_range_while:
                    while(newChecked + ve <= w.length)
                    {
                        if(w[newChecked++] == t)
                        {
                            // found first element, look for the others
                            foreach(i; 1 .. ve)
                                if(w[newChecked] != delimElems[i])
                                    continue generic_range_while;
                                else
                                    ++newChecked;
                            endsWithDelim = true;
                            break byline_outer_1;
                        }
                    }
                }
            } while(chain.extend(elements) != 0);

            if(!endsWithDelim)
            {
                // ran out of data
                newChecked = chain.window.length;
            }
        }

        auto prevChecked = checked;
        if(checked != newChecked)
        {
            ++_segments;
            checked = newChecked;
        }
        return checked - prevChecked;
    }

    /// Number of extend calls so far that added data to the window.
    size_t segments() { return _segments; }

    mixin implementValve!chain;
}
472 
/**
 * Process a given text iopipe by a given code point delimiter. The only
 * behavior that changes from the input pipe is that extensions to the window
 * deliver exactly one more delimited segment of text.
 *
 * Params:
 *    c = The input text iopipe. This must have a window whose elements are
 *        valid character types.
 *    delim = The code point with which to delimit the text. Each extension to
 *        the iopipe will either end on this delimiter, or will be the last
 *        segment in the pipe.
 * Returns:
 *    An iopipe that behaves as described above.
 */
auto delimitedText(Chain)(Chain c, dchar delim = '\n')
   if(isIopipe!Chain &&
      isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    import std.traits: Unqual;
    alias Pipe = DelimitedTextPipe!(Chain);
    auto pipe = Pipe(c);

    // store the delimiter in the code unit type of the pipe
    static if(is(Pipe.CodeUnitType == dchar))
    {
        // one code point is exactly one code unit
        pipe.delimElems[0] = delim;
    }
    else
    {
        import std.utf: encode;
        pipe.validDelimElems = cast(ubyte)encode(pipe.delimElems, delim);

        // need to be able to skip at least one element; after that, count
        // the following elements up to the first repeat of the first one.
        ubyte skippable = 1;
        for(size_t idx = 1; idx < pipe.validDelimElems; ++idx)
        {
            if(pipe.delimElems[idx] == pipe.delimElems[0])
                break;
            ++skippable;
        }
        pipe.skippableElems = skippable;
    }
    return pipe;
}
512 
@safe unittest
{
    // Runs the same space-delimited scenario against any pipe type.
    static void checkPipe(Pipe)(Pipe pipe)
    {
        // every extend adds exactly one more segment to the window
        foreach(expected; ["hello ", "hello world, ", "hello world, this "])
        {
            pipe.extend;
            assert(pipe.window == expected);
        }
        assert(pipe.segments == 3);
        assert(pipe.delimTrailer == 1);

        // consume the rest: the last segment has no trailing delimiter
        pipe.process();
        assert(pipe.segments == 6);
        assert(pipe.delimTrailer == 0);
    }

    checkPipe("hello world, this is a test".delimitedText(' '));
    // bug #32
    checkPipe(SimplePipe!string("hello world, this is a test").delimitedText(' '));

    // add valve support
    import iopipe.valve;
    auto valved = "hello world".simpleValve.delimitedText(' ');
    auto inner = valved.valve;
    assert(inner == "hello world");
}
539 
/**
 * Convenience wrapper for `delimitedText` that splits on the newline
 * character '\n'. Equivalent to `delimitedText(c, '\n');`
 *
 * Params:
 *    c = The input text iopipe. This must have a window whose elements are
 *        valid character types.
 * Returns:
 *    A line delimited iopipe.
 */
auto byLine(Chain)(Chain c)
{
    enum dchar newline = '\n';
    return c.delimitedText(newline);
}
554 
// Input range over the segments of a delimited iopipe that strips the
// trailing delimiter from each element.
// Note that the Chain MUST be a ByDelim iopipe (it must provide delimTrailer).
private struct NoDelimRange(Chain)
{
    Chain chain;
    // number of delimiter code units at the end of the current window
    ubyte delimElems;

    // the range is exhausted once the pipe has nothing left in its window
    bool empty()
    {
        return !chain.window.length;
    }

    // current segment, without its delimiter
    auto front()
    {
        auto segment = chain.window;
        return segment[0 .. $ - delimElems];
    }

    // drop the current segment and fetch the next one
    void popFront()
    {
        immutable consumed = chain.window.length;
        chain.release(consumed);
        chain.extend(0);
        delimElems = chain.delimTrailer;
    }
}
570 
/**
 * Given a text iopipe, returns a range based on splitting the text by a given
 * code point. This has the advantage over `delimitedText.asRange` in that the
 * delimiter can be hidden.
 *
 * Params:
 *     KeepDelimiter = If true, then the delimiter is included in each element
 *        of the range (if present from the original iopipe).
 *     c = The iopipe to range-ify. Its window must have a character element
 *        type, the same requirement `delimitedText` has.
 *     delim = The dchar to use for delimiting.
 * Returns:
 *     An input range whose elements are the delimited text segments, with or
 *     without delimiters as specified by the KeepDelimiter boolean.
 */
auto byDelimRange(bool KeepDelimiter = false, Chain)(Chain c, dchar delim)
   if(isIopipe!Chain &&
      // Same constraint as delimitedText. Testing the auto-decoded
      // ElementType for dchar would reject text windows that are not
      // built-in arrays even though delimitedText handles them.
      isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    auto p = c.delimitedText(delim);
    static if(KeepDelimiter)
    {
        // just use standard input range adapter
        return p.asInputRange;
    }
    else
    {
        auto r = NoDelimRange!(typeof(p))(p);
        // pre-fetch first line
        r.popFront();
        return r;
    }
}
604 
/**
 * Convenience wrapper for byDelimRange that uses the newline character '\n' as
 * the delimiter. Equivalent to `byDelimRange!(KeepDelimiter)(c, '\n');`
 *
 * Params:
 *     KeepDelimiter = If true, then the delimiter is included in each element
 *        of the range (if present from the original iopipe).
 *     c = The iopipe to range-ify.
 * Returns:
 *     An input range whose elements are lines of text from the input iopipe,
 *     with or without delimiters as specified by the KeepDelimiter boolean.
 */
auto byLineRange(bool KeepDelimiter = false, Chain)(Chain c)
{
    enum dchar newline = '\n';
    return c.byDelimRange!(KeepDelimiter)(newline);
}
622 
@safe unittest
{
    import std.algorithm : equal;

    // no leading or trailing newline
    assert(equal("hello\nworld".byLineRange, ["hello", "world"]));
    assert(equal("hello\nworld".byLineRange!true, ["hello\n", "world"]));

    // a leading newline yields an empty first line
    assert(equal("\nhello\nworld".byLineRange, ["", "hello", "world"]));
    assert(equal("\nhello\nworld".byLineRange!true, ["\n", "hello\n", "world"]));

    // a trailing newline does not yield an extra empty line
    assert(equal("\nhello\nworld\n".byLineRange, ["", "hello", "world"]));
    assert(equal("\nhello\nworld\n".byLineRange!true, ["\n", "hello\n", "world\n"]));
}
633 
/**
 * Output range of characters layered on top of an output iopipe.
 *
 * Each `put` makes sure the window of `chain` has room for the encoded
 * character, writes the code units at the front of the window, and then
 * releases them.
 */
static struct TextOutput(Chain)
{
    Chain chain;
    /// Code unit type of the underlying window.
    alias CT = typeof(Chain.init.window[0]);

    // TODO: allow putting of strings

    /**
     * Write one character to the pipe, transcoding it to the pipe's code
     * unit type when the sizes differ.
     *
     * Params:
     *     c = The character to write. When it has the same size as the
     *         pipe's code unit it is stored as is.
     * Throws:
     *     Exception when a non-ascii `char` is written to a wchar or dchar
     *     pipe, or an invalid `wchar` is written to a dchar pipe.
     */
    void put(A)(A c)
    {
        import std.utf;
        static if(A.sizeof == CT.sizeof)
        {
            // output the data directly to the output stream
            if(chain.ensureElems(1) == 0)
                assert(0);
            chain.window[0] = c;
            chain.release(1);
        }
        else
        {
            static if(is(CT == char))
            {
                static if(is(A : const(wchar)))
                {
                    // A is a wchar.  Make sure it's not a surrogate pair
                    // (that it's a valid dchar)
                    if(!isValidDchar(c))
                        assert(0);
                }
                // convert the character to utf8
                if(c <= 0x7f)
                {
                    if(chain.ensureElems(1) == 0)
                        assert(0);
                    chain.window[0] = cast(char)c;
                    chain.release(1);
                }
                else
                {
                    // Build the sequence back to front: each pass emits one
                    // continuation byte carrying the low 6 bits, while mask
                    // shrinks to the payload that fits in a lead byte of the
                    // length reached so far.
                    char[4] buf = void;
                    auto idx = 3;
                    auto mask = 0x3f;
                    dchar c2 = c;
                    while(c2 > mask)
                    {
                        buf[idx--] = 0x80 | (c2 & 0x3f);
                        c2 >>= 6;
                        mask >>= 1;
                    }
                    // lead byte: length marker bits followed by what is left
                    buf[idx] = (c2 | (~mask << 1)) & 0xff;
                    const elems = buf.length - idx;
                    if(chain.ensureElems(elems) < elems)
                        assert(0);
                    chain.window[0 .. elems] = buf[idx .. $];
                    chain.release(elems);
                }
            }
            else static if(is(CT == wchar))
            {
                static if(is(A : const(char)))
                {
                    // this is a utf-8 character, only works if it's an
                    // ascii character
                    if(c > 0x7f)
                        throw new Exception("invalid character output");
                }
                // convert the character to utf16
                assert(isValidDchar(c));
                // Everything up to and including U+FFFF fits in one wchar.
                if(c <= 0xFFFF)
                {
                    if(chain.ensureElems(1) == 0)
                        assert(0);
                    chain.window[0] = cast(wchar)c;
                    chain.release(1);
                }
                else
                {
                    if(chain.ensureElems(2) < 2)
                        assert(0);
                    // supplementary plane: split the 20 bit offset into a
                    // high and a low surrogate
                    wchar[2] buf = void;
                    const dc = cast(uint)(c - 0x10000);
                    buf[0] = cast(wchar)(((dc >> 10) & 0x3FF) + 0xD800);
                    buf[1] = cast(wchar)((dc & 0x3FF) + 0xDC00);
                    chain.window[0..2] = buf;
                    chain.release(2);
                }
            }
            else static if(is(CT == dchar))
            {
                static if(is(A : const(char)))
                {
                    // this is a utf-8 character, only works if it's an
                    // ascii character
                    if(c > 0x7f)
                        throw new Exception("invalid character output");
                }
                else static if(is(A : const(wchar)))
                {
                    // A is a wchar.  Make sure it's not a surrogate pair
                    // (that it's a valid dchar)
                    if(!isValidDchar(c))
                        throw new Exception("invalid character output");
                }
                // converting to utf32, just write directly
                if(chain.ensureElems(1) == 0)
                    assert(0);
                chain.window[0] = c;
                chain.release(1);
            }
            else
                static assert(0, "invalid types used for output stream, " ~ CT.stringof ~ ", " ~ A.stringof);
        }
    }
}
748 
/**
 * Take a text-based iopipe and turn it into an output range of `dchar`. Note
 * that the iopipe must be an output iopipe, not an input one. In other words,
 * a `textOutput` result doesn't output its input, it uses its input as a place
 * to deposit data.
 *
 * The given iopipe window will be written to, then data that is ready to be
 * output is released. It is expected that the iopipe will use this mechanism
 * to actually know which data to output. See the example for more information.
 *
 * Params:
 *     c = The output iopipe that can be used to put dchars into.
 * Returns:
 *     An output range that can accept all forms of text data for output.
 */
auto textOutput(Chain)(Chain c)
{
    // Wrap c in an output range of dchar/code units. Releasing and extending
    // c is assumed to be what actually outputs the data.
    alias Output = TextOutput!Chain;
    return Output(c);
}
771 
///
@safe unittest
{
    import std.range : put;
    // a writeable buffer serves as the output destination
    char[256] storage;
    size_t written = 0;

    // minimal iopipe that counts how many chars get released (i.e. written)
    struct LocalIopipe
    {
        char[] window;
        void release(size_t elems)
        {
            window.release(elems);
            written += elems;
        }
        size_t extend(size_t elems) { return 0; }
    }
    auto sink = LocalIopipe(storage[]).textOutput;
    put(sink, "hello, world");

    // written is updated whenever the iopipe is released
    assert(storage[0 .. written] == "hello, world");
}
797 
/**
 * Convert iopipe of one text type into an iopipe for another type. Performs
 * conversions at the code-point level. If specified, the resulting iopipe will
 * ensure there is a BOM at the beginning of the iopipe. This is useful if
 * writing to storage.
 *
 * If the window's element type already is exactly `Char` and no BOM is
 * required, the original iopipe is returned.
 *
 * Params:
 *     Char = The desired character type in the resulting iopipe. Must be one
 *           of char, wchar, or dchar.
 *     ensureBOM = If true, the resulting iopipe will ALWAYS have a byte order
 *           mark at the beginning of the stream. At the moment this is
 *           accomplished by copying all the data from the original iopipe to
 *           the new one. A better mechanism is being worked on.
 *     chain = The source iopipe.
 * Returns:
 *     An iopipe which fulfills the given requirements.
 *
 */
auto convertText(Char = char, bool ensureBOM = false, Chain)(Chain chain) if (isSomeChar!Char)
{
    // the element type must match Char exactly, qualifiers included
    enum sameElementType = is(ElementEncodingType!(WindowType!(Chain)) == Char);

    static if(ensureBOM || !sameElementType)
        return chain.textConverter!ensureBOM.bufd!Char;
    else
        return chain;
}
826 
@safe unittest
{
    // convert a char based pipe into a wchar based one
    auto source = "hello";
    immutable(ushort)[] withBOM = cast(immutable(ushort)[])"\ufeffhello"w;

    auto plain = source.convertText!wchar;
    static assert(is(WindowType!(typeof(plain)) == wchar[]));

    plain.extend(100);// fill the pipe
    assert(plain.window.length == 5);
    // without ensureBOM the output is just the text
    assert(cast(ushort[])plain.window == withBOM[1 .. $]);

    // with ensureBOM the output starts with the byte order mark
    auto marked = source.convertText!(wchar, true);
    marked.extend(100);
    assert(marked.window.length == 6);
    assert(cast(ushort[])marked.window == withBOM);
}
845 
846 
/**
 * A converter to allow conversion into any other type of text.
 *
 * The converter does 2 things. First and foremost, it adds a read function
 * that allows conversion into any other width of text. The read function
 * converts as much text as possible into the given format, extending the base
 * iopipe as necessary.
 *
 * The second thing that it does is potentially add a BOM character to the
 * beginning of the text. It was decided to add this here, since you are likely
 * already copying data from one iopipe into another. However, in future
 * versions, this capability may go away, as we can do this elsewhere with less
 * copying. So expect this API to change.
 *
 * The BOM is emitted at most once: on the first `read` call that finds data in
 * the source, and only if the source does not already start with a BOM. An
 * empty source never gets a BOM.
 *
 * Params:
 *     ensureBOM = If true, `read` prepends a BOM to the converted text when
 *          the source text does not already begin with one.
 *     c = The source iopipe. Its window must hold text code units.
 * Returns:
 *     A `TextConverter` wrapping `c`. All iopipe members of `c` remain
 *     reachable through `alias this`; in addition the converter provides
 *     `read(Char[] buf)`, which fills `buf` with transcoded text and returns
 *     the number of `Char` elements written (0 when no more data is
 *     available).
 */
template textConverter(bool ensureBOM = false, Chain)
{
    struct TextConverter
    {
        // The wrapped source iopipe (also exposed through alias this).
        Chain chain;
        static if(ensureBOM)
        {
            // True until the BOM decision has been made by read, or until
            // data has been released through this wrapper.
            bool atBeginning = true;

            // Forward release to the source, remembering that we are no
            // longer at the beginning once anything has been released.
            auto release(size_t elems)
            {
                atBeginning = atBeginning && elems == 0;
                return chain.release(elems);
            }
        }

        /**
         * Convert text from the source chain into `buf`.
         *
         * Params:
         *     buf = Destination buffer. When a BOM has to be written it must
         *           have room for it (3 elements for `char`, 1 otherwise).
         * Returns:
         *     Number of elements written to `buf`; 0 means no data was
         *     available.
         */
        size_t read(Char)(Char[] buf)
        {
            alias SrcChar = ElementEncodingType!(WindowType!(Chain));
            if(buf.length == 0)
                return 0;
            // first step, check to see if the first code point is a BOM
            size_t result = 0;
            static if(ensureBOM)
            {
                if(atBeginning)
                {
                    // utf8 bom is 3 code units, in other char types, it's only 1.
                    bool addBOM = true;
                    static if(is(Unqual!SrcChar == char))
                    {
                        if(chain.window.length < 3)
                            chain.extend(0);
                        if(chain.window.length == 0)
                            return 0; // special case, don't insert a BOM for a blank file.
                        if(chain.window.length >= 3 &&
                           chain.window[0] == 0xef &&
                           chain.window[1] == 0xbb &&
                           chain.window[2] == 0xbf)
                        {
                            addBOM = false;
                        }
                    }
                    else
                    {
                        if(chain.window.length < 1)
                            if(chain.extend(0) == 0)
                                return 0; // special case, don't insert a BOM for a blank file.
                        if(chain.window[0] == 0xfeff)
                            addBOM = false;
                    }

                    if(addBOM)
                    {
                        // write the BOM to the given buffer
                        static if(is(Char == char))
                        {
                            buf[0] = 0xef;
                            buf[1] = 0xbb;
                            buf[2] = 0xbf;

                            result = 3;
                            buf = buf[3 .. $];
                        }
                        else
                        {
                            buf[0] = 0xfeff;
                            result = 1;
                            buf = buf[1 .. $];
                        }
                    }

                    // The BOM question is settled now. Data consumed below is
                    // released directly on `chain`, which bypasses our own
                    // release(), so the flag has to be cleared here; otherwise
                    // every later read would insert another BOM.
                    atBeginning = false;
                }
            }
            static if(is(Unqual!Char == Unqual!SrcChar))
            {
                import std.algorithm.mutation: copy;
                import std.algorithm.comparison: min;
                // try an extend when window length gets to be less than read size.
                if(chain.window.length < buf.length)
                    chain.extend(buf.length - chain.window.length);
                if(chain.window.length == 0)
                    // no more data (result can only be non-zero here if a BOM
                    // was written, in which case the window is not empty)
                    return result;
                // copy only what both the source window and the destination
                // buffer can hold.
                immutable len = min(chain.window.length, buf.length);
                copy(chain.window[0 .. len], buf[0 .. len]);
                chain.release(len);
                return result + len;
            }
            else
            {
                // need to transcode each code point.
                import std.utf;

                // Number of SOURCE code units that guarantees a complete code
                // point is available: 1 for UTF-32, 2 for UTF-16, 4 for UTF-8.
                static if(is(Unqual!SrcChar == dchar))
                    enum minValidElems = 1;
                else static if(is(Unqual!SrcChar == wchar))
                    enum minValidElems = 2;
                else
                    enum minValidElems = 4;

                auto win = chain.window;
                size_t pos = 0;
                bool didExtend = false;
                bool eof = false;
                while(buf.length > 0)
                {
                    if(!eof && pos + minValidElems > chain.window.length)
                    {
                        if(!didExtend)
                        {
                            didExtend = true;
                            // give the upstream pipe some buffer space
                            chain.release(pos);
                            pos = 0;
                            if(chain.extend(0))
                            {
                                win = chain.window;
                                continue;
                            }
                            win = chain.window;
                            // else, we aren't going to get any more data. decode as needed.
                            eof = true;
                        }
                        else
                            // don't decode any more. We can wait until next time.
                            break;
                    }
                    if(pos == win.length)
                        // end of the stream
                        break;
                    // decode a code point
                    auto oldPos = pos;
                    dchar dc;
                    dc = decode(win, pos);
                    // encode the dchar into a new item
                    Char[dchar.sizeof / Char.sizeof] encoded;
                    auto nChars = encode(encoded, dc);
                    if(nChars > buf.length)
                    {
                        // read as much as we could; leave this code point in
                        // the source for the next call.
                        pos = oldPos;
                        break;
                    }
                    if(nChars == 1)
                        buf[0] = encoded[0];
                    else
                        buf[0 .. nChars] = encoded[0 .. nChars];
                    result += nChars;
                    buf = buf[nChars .. $];
                }

                // release the chain data that we have processed.
                chain.release(pos);
                return result;
            }
        }
        alias chain this;
    }

    auto textConverter(Chain c)
    {
        return TextConverter(c);
    }
}
1018 
/**
 * Turn a text iopipe into a ubyte iopipe holding the text in the requested
 * encoding, ready to be written to a storage device.
 *
 * The text is first converted to the code unit type of `enc` (see
 * `CodeUnit`). For the UTF-16/UTF-32 encodings the result is then passed
 * through `byteSwapper`, instantiated with `true` for the little endian
 * variants and `false` for the big endian ones, before being cast to ubyte.
 *
 * Params:
 *     enc = The encoding type to use.
 *     c = The source iopipe. Must be an iopipe where the window type's element
 *          type is text based.
 * Returns:
 *     A ubyte iopipe that represents the encoded version of the input iopipe
 *     based on the provided encoding.
 */
auto encodeText(UTFType enc = UTFType.UTF8, Chain)(Chain c)
{
    // Classify the requested encoding once, at compile time.
    enum bool isUTF8 = enc == UTFType.UTF8;
    enum bool isLittle = enc == UTFType.UTF16LE || enc == UTFType.UTF32LE;
    enum bool isBig = enc == UTFType.UTF16BE || enc == UTFType.UTF32BE;

    // Bring the text to the code unit width of the target encoding.
    auto text = c.convertText!(CodeUnit!enc);

    static if(isUTF8)
    {
        // Single-byte code units: no byte order to worry about.
        return text.arrayCastPipe!ubyte;
    }
    else static if(isLittle || isBig)
    {
        return text.byteSwapper!(isLittle).arrayCastPipe!ubyte;
    }
    else
        assert(0);
}
1051 
@safe unittest
{
    import core.bitop : bswap;

    // Choose the UTF-32 variant whose byte order is the opposite of the
    // platform's, so that the output is expected to be byte swapped.
    version(BigEndian)
        enum swappedEncoding = UTFType.UTF32LE;
    else
        enum swappedEncoding = UTFType.UTF32BE;

    auto text = "hello";
    auto encoded = text.encodeText!swappedEncoding;
    static assert(is(WindowType!(typeof(encoded)) == ubyte[]));

    // Expected data: the native UTF-32 code units, each one byte swapped.
    uint[] swapped = cast(uint[])"hello"d.dup;
    foreach(ref unit; swapped)
        unit = bswap(unit);

    encoded.extend(100);
    assert(encoded.window == cast(ubyte[])swapped);
}
1072 
/**
 * Detect the encoding of an input chain of encoded text data at runtime, and
 * call the given template function with that encoding as a compile-time
 * parameter. Useful for writing code that needs to handle all the forms of
 * text encoding.
 *
 * Inside the function, use the encoding type as a parameter to assumeText to
 * get an iopipe of `char`, `wchar`, or `dchar` elements for processing.
 *
 * Because the BOM detection and dispatch happen at runtime, func must return
 * the same type no matter which encoding it is instantiated with. func is
 * instantiated once for every member of UTFType (except `Unknown` when
 * UnknownIsUTF8 is true), whether or not the input actually uses that
 * encoding.
 *
 * The second version (runEncoded) is for functions that don't care what the
 * encoding is, but just want a text iopipe with the appropriate encoding
 * already handled. In this case, the function receives a chain of `char`,
 * `wchar`, or `dchar` window elements.
 *
 * Params:
 *     func = The template function to call.
 *     UnknownIsUTF8 = If true, then an undetected encoding will be passed as
 *          UTF8 to your function. Otherwise, the Unknown encoding will be passed.
 *     c = The iopipe input chain that should have encoded text in it.
 *     args = Any optional args to pass to the function.
 * Returns:
 *     The return value from func.
 */
auto ref runWithEncoding(alias func, bool UnknownIsUTF8 = true, Chain, Args...)(Chain c, auto ref Args args)
    if(isIopipe!Chain && is(typeof(detectBOM(c.window))))
{
    import std.traits: EnumMembers;

    // Ask for 4 elements up front, then let detectBOM inspect the window.
    c.ensureElems(4);

    // Map the runtime detection result onto a compile-time encoding value.
    final switch(detectBOM(c.window))
    {
        // BUG: static foreach should work, but doesn't, waiting for issue 17807 to
        // make it into a release.
        /*static*/ foreach(enc; EnumMembers!UTFType)
        {
        case enc:
            static if(enc != UTFType.Unknown || !UnknownIsUTF8)
                return func!(enc)(c, args);
            else
                // undetected encoding is treated as UTF8
                goto case UTFType.UTF8;
        }
    }
}
1122 
1123 /// Ditto
auto ref runEncoded(alias func, Chain, Args...)(Chain c, auto ref Args args)
{
    // Adapter handed to runWithEncoding: for the detected encoding, wrap the
    // chain with assumeText and pass the resulting text iopipe (plus any
    // extra arguments) on to func.
    static auto ref withText(UTFType enc)(Chain source, auto ref Args extra)
    {
        return func(source.assumeText!enc, extra);
    }

    // Encoding detection and dispatch are done by runWithEncoding.
    return runWithEncoding!withText(c, args);
}
1133 
1134 // TODO: need unit tests here.
1135 //
1136 
@safe nothrow unittest
{
    import std.range : walkLength;

    // Evaluate byLineRange over a string at compile time (CTFE) and check
    // that it yields the expected number of lines.
    static void checkLineCount(Char)()
    {
        enum immutable(Char)[] text = "hello\nworld\nthis\nis\na\ntest";
        static assert(text.byLineRange.walkLength == 6);
    }

    // Exercise every character width.
    checkLineCount!char();
    checkLineCount!wchar();
    checkLineCount!dchar();
}