/**
Text handling with iopipe.

Copyright: Copyright Steven Schveighoffer 2011-.
License:   Boost License 1.0. (See accompanying file LICENSE_1_0.txt or copy
           at http://www.boost.org/LICENSE_1_0.txt)
Authors:   Steven Schveighoffer
 */
module iopipe.textpipe;
import iopipe.bufpipe;
import iopipe.traits;
import std.range: isRandomAccessRange, hasLength, ElementType, ElementEncodingType;
import std.traits: Unqual, isSomeChar, isDynamicArray, isIntegral;

/**
 * Used to specify stream type
 */
enum UTFType
{
    Unknown,
    UTF8,
    UTF16LE,
    UTF16BE,
    UTF32LE,
    UTF32BE
}

/**
 * Aliased to code unit type of a specified stream type.
 *
 * `Unknown` is specified as char (UTF8 is the default)
 */
template CodeUnit(UTFType u)
{
    static if(u == UTFType.Unknown || u == UTFType.UTF8)
        alias CodeUnit = char;
    else static if(u == UTFType.UTF16LE || u == UTFType.UTF16BE)
        alias CodeUnit = wchar;
    else static if(u == UTFType.UTF32LE || u == UTFType.UTF32BE)
        alias CodeUnit = dchar;
    else
        static assert(0);
}

/**
 * Using the given random access range of bytes, determine the stream width.
 * This does not advance the range past the BOM.
 *
 * Params:
 *    r = Range in which to detect BOM. Must be a random access range with
 *        element type of ubyte. Cannot be an infinite range.
 *
 * Returns:
 *    Instance of UTFType indicating what the BOM decoding implies.
 */
UTFType detectBOM(R)(R r) if (isRandomAccessRange!R && hasLength!R && is(ElementType!R : const(ubyte)))
{
    if(r.length >= 2)
    {
        if(r[0] == 0xFE && r[1] == 0xFF)
            return UTFType.UTF16BE;
        if(r[0] == 0xFF && r[1] == 0xFE)
        {
            // FF FE is the UTF16LE BOM, but FF FE 00 00 is the UTF32LE BOM.
            if(r.length >= 4 && r[2] == 0 && r[3] == 0)
            {
                // most likely UTF32
                return UTFType.UTF32LE;
            }
            return UTFType.UTF16LE;
        }

        if(r.length >= 3 && r[0] == 0xEF && r[1] == 0xBB && r[2] == 0xBF)
            return UTFType.UTF8;
        if(r.length >= 4 && r[0] == 0 && r[1] == 0 && r[2] == 0xFE && r[3] == 0xFF)
            return UTFType.UTF32BE;
    }
    return UTFType.Unknown;
}

@safe unittest
{
    with(UTFType)
    {
        ubyte[] BOM = [0xFE, 0XFF, 0xFE, 0, 0, 0xFE, 0xFF, 0xEF, 0xBB, 0xBF];
        assert(BOM.detectBOM == UTF16BE);
        assert(BOM[1 .. 4].detectBOM == UTF16LE);
        assert(BOM[1 .. 5].detectBOM == UTF32LE);
        assert(BOM[3 .. $].detectBOM == UTF32BE);
        assert(BOM[7 .. $].detectBOM == UTF8);
        assert(BOM[4 .. $].detectBOM == Unknown);
    }
}

/**
 * Iopipe wrapper that hides a trailing, not-yet-complete code point from its
 * window. `partial` is the number of code units at the end of the upstream
 * window that are currently withheld.
 *
 * Instances are created by `ensureDecodeable`.
 */
struct DecodeableWindow(Chain, CodeUnitType)
{
    Chain chain;
    ubyte partial;

    /// The upstream window minus the withheld trailing code units.
    auto window() { return chain.window[0 .. $-partial]; }

    /// Release elements from the front of the upstream window.
    void release(size_t elements) { chain.release(elements); }

    // Inspect the tail of the upstream window and recompute `partial`.
    private void determinePartial()
    {
        static if(is(CodeUnitType == char))
        {
            import core.bitop : bsr;
            auto w = chain.window;

            // Anything that is not a provably incomplete sequence is passed
            // through (partial == 0); invalid data is left for the decoder to
            // reject.
            // TODO: throw some error on invalid sequences?
            partial = 0;

            // Walk backwards from the end looking for the lead byte of the
            // last sequence. A sequence is at most 4 code units long, so the
            // lead byte can be up to 4 units from the end.
            foreach(ubyte i; 1 .. 5)
            {
                if(w.length < i)
                    // either no data, or only continuation bytes (invalid).
                    return;

                immutable inverted = ~cast(uint)w[$ - i] & 0x0ff;
                if(inverted == 0)
                    // 0xFF is never valid UTF-8, and bsr(0) is undefined.
                    return;

                // position of the highest zero bit identifies the byte kind
                immutable highestBit = bsr(inverted);
                switch(highestBit)
                {
                case 7:
                    // ascii character
                    return;
                case 6:
                    // continuation byte, need to continue looking
                    break;
                case 3: .. case 5:
                    // 5 -> 2 byte sequence
                    // 4 -> 3 byte sequence
                    // 3 -> 4 byte sequence
                    // i + highestBit == 7 means the sequence is complete,
                    // more than 7 means too many continuation bytes (invalid,
                    // let it fail). Only less than 7 is an incomplete
                    // sequence that must be withheld.
                    if(i + highestBit < 7)
                        partial = i;
                    return;
                default:
                    // invalid lead byte, let it fail
                    return;
                }
            }
        }
        else // wchar
        {
            // if the last character is in 0xD800 - 0xDBFF, then it is
            // the first wchar of a surrogate pair. This means we must
            // leave it off the end.
            auto w = chain.window;
            partial = (w.length > 0 && (w[$-1] & 0xFC00) == 0xD800) ? 1 : 0;
        }
    }

    /**
     * Extend the upstream pipe and expose any newly decodeable code units.
     *
     * Keeps extending while upstream delivers data that only adds to an
     * incomplete sequence, so that a return value of 0 means upstream
     * returned 0 rather than "only part of a code point arrived".
     *
     * Returns: the number of code units added to `window`.
     */
    size_t extend(size_t elements)
    {
        immutable origWindowSize = window.length;
        while(true)
        {
            immutable added = chain.extend(elements > partial ? elements - partial : elements);
            determinePartial();
            if(added == 0 || window.length > origWindowSize)
                break;
        }
        immutable newWindowSize = window.length;
        return newWindowSize > origWindowSize ? newWindowSize - origWindowSize : 0;
    }

    mixin implementValve!chain;
}

/**
 * Wraps a text-based iopipe to make sure all code units are decodeable.
 *
 * When an iopipe is made up of character types, in some cases a slice of the
 * window may not be completely decodeable. For example, a wchar iopipe may
 * have only one half of a surrogate pair at the end of the window.
 *
 * This function generates an iopipe that only allows completely decodeable
 * sequences to be released to the next iopipe.
 *
 * Params:
 *    c = The iopipe whose element type is one of char, wchar, or dchar.
 *
 * Returns:
 *    An appropriate iopipe that ensures decodeability. Note that dchar iopipes
 *    are always decodeable, so the result is simply a return of the input.
 */
auto ensureDecodeable(Chain)(Chain c) if (isIopipe!Chain && isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    alias CodeUnitType = Unqual!(ElementEncodingType!(WindowType!Chain));

    // need to stop chaining if the last thing was an ensureDecodeable. Of
    // course, it's very hard to check if the type is a DecodeableWindow. What
    // we do is pretend to wrap c's upstream chain, and see if it results in
    // the exact type we were passed. If this is the case, then it must be a
    // type that was wrapped with a DecodeableWindow.
    static if(is(CodeUnitType == dchar))
    {
        // always decodeable
        return c;
    }
    else static if(__traits(hasMember, Chain, "chain") &&
                   is(typeof(.ensureDecodeable(c.chain)) == Chain))
    {
        return c;
    }
    else
    {
        auto r = DecodeableWindow!(Chain, CodeUnitType)(c);
        r.determinePartial();
        return r;
    }
}

@safe unittest
{
    // check that ensureDecodeable just returns itself when called twice
    auto str = "hello";
    auto d1 = str.ensureDecodeable;
    auto d2 = d1.ensureDecodeable;
    static assert(is(typeof(d1) == typeof(d2)));
}

@safe unittest
{
    // a 4 code unit sequence that arrives one code unit at a time must be
    // withheld until complete, and then exposed in full.
    static struct OneAtATime
    {
        string data;
        size_t avail;
        auto window() { return data[0 .. avail]; }
        void release(size_t elems)
        {
            data = data[elems .. $];
            avail -= elems;
        }
        size_t extend(size_t)
        {
            if(avail == data.length)
                return 0;
            ++avail;
            return 1;
        }
    }

    auto p = OneAtATime("\U0001F600").ensureDecodeable;
    assert(p.window.length == 0);
    assert(p.extend(0) == 4);
    assert(p.window == "\U0001F600");
    assert(p.extend(0) == 0);
}

/**
 * Given an iopipe whose window is a buffer that is a dynamic array of data of
 * integral type, performs the proper transformations in order to get a buffer
 * of valid char, wchar, or dchar elements, depending on the provided encoding.
 * This function is useful for when you have data from a raw source (such as a
 * file or stream) that you have determined or know is really a stream of UTF
 * data.
 *
 * If the data must be byte-swapped, then it must be mutable. Otherwise,
 * immutable or const data is allowed.
 *
 * Params:
 *    enc = The assumed encoding of the text pipe.
 *    c = The chain to assume the encoding for. This MUST have a dynamic
 *        array type for its window, and the elements must be integral.
 * Returns:
 *    An appropriate iopipe that has a window of the appropriate character type
 *    (`char`, `wchar`, or `dchar`) for the assumed encoding. The window will
 *    be set up so its elements are properly byte-ordered for the compiled
 *    platform.
 */
auto assumeText(UTFType enc = UTFType.UTF8, Chain)(Chain c) if (isIopipe!Chain && isDynamicArray!(WindowType!Chain) && isIntegral!(ElementEncodingType!(WindowType!Chain)))
{
    static if(enc == UTFType.UTF8 || enc == UTFType.Unknown)
        return c.arrayCastPipe!char;
    else static if(enc == UTFType.UTF16LE || enc == UTFType.UTF32LE)
    {
        return c.arrayCastPipe!(CodeUnit!enc).byteSwapper!true;
    }
    else static if(enc == UTFType.UTF16BE || enc == UTFType.UTF32BE)
    {
        return c.arrayCastPipe!(CodeUnit!enc).byteSwapper!false;
    }
    else
        static assert(0);
}

@safe unittest
{
    import std.algorithm : equal;
    import core.bitop : bswap;
    // standard char array, casted to ubyte (typical case)
    ubyte[] str1 = ['h', 'e', 'l', 'l', 'o'];

    auto p1 = str1.assumeText!(UTFType.UTF8);
    static assert(is(WindowType!(typeof(p1)) == char[]));
    assert("hello".equal(p1.window));

    // build a byte-swapped array for "hello"
    uint[] str2 = ['h', 'e', 'l', 'l', 'o'];
    foreach(ref i; str2)
        i = bswap(i);

    // encoding should be utf32, in non-native endianness.
    version(BigEndian)
    {
        enum encType = UTFType.UTF32LE;
    }
    else
    {
        enum encType = UTFType.UTF32BE;
    }

    auto p2 = str2.assumeText!encType;
    static assert(is(WindowType!(typeof(p2)) == dchar[]));
    assert("hello".equal(p2.window));
}

/*
 * Iopipe that exposes its upstream text one delimited segment at a time.
 *
 * `checked` is the number of upstream window elements that have been scanned
 * and are visible through `window`; each successful `extend` moves it just
 * past the next delimiter (or to the end of the data).
 */
private struct DelimitedTextPipe(Chain)
{
    alias CodeUnitType = Unqual!(typeof(Chain.init.window[0]));
    private
    {
        Chain chain;
        size_t checked;
        size_t _segments;
        bool endsWithDelim;
        // the delimiter, encoded in the code unit type of the chain
        CodeUnitType[dchar.sizeof / CodeUnitType.sizeof] delimElems;

        static if(is(CodeUnitType == dchar))
        {
            enum validDelimElems = 1;
            enum skippableElems = 1;
        }
        else
        {
            // number of elements in delimElems that are valid
            ubyte validDelimElems;
            // number of elements that can be skipped if the sequence fails
            // to match. This basically is the number of elements that are
            // not the first element (except the first element of course).
            ubyte skippableElems;
        }

        // Index of the first occurrence of needle in hay, or hay.length if
        // it does not occur.
        static size_t findFirst(const(CodeUnitType)[] hay, CodeUnitType needle) @trusted
        {
            static if(CodeUnitType.sizeof == 1)
            {
                // can use memchr
                import core.stdc.string : memchr;
                auto hit = cast(const(CodeUnitType)*)memchr(hay.ptr, needle, hay.length);
                return hit is null ? hay.length : cast(size_t)(hit - hay.ptr);
            }
            else
            {
                foreach(i, e; hay)
                    if(e == needle)
                        return i;
                return hay.length;
            }
        }

        // Scan window w for the complete delimiter, starting at pos.
        //
        // Returns true with pos just past the delimiter if one was found.
        // Otherwise returns false with pos at the place scanning should
        // resume once more data is available. A failed partial match does
        // NOT end the scan; the rest of the window is still searched.
        bool scanWindow(W)(W w, ref size_t pos)
        {
            const ve = validDelimElems;
            immutable first = delimElems[0];

            // never start a match that cannot fit in the window
            while(pos + ve <= w.length)
            {
                static if(isDynamicArray!W)
                {
                    // pointer tricks and memchr are not available in CTFE
                    if(!__ctfe)
                    {
                        // jump straight to the next candidate first element
                        immutable limit = w.length - (ve - 1);
                        pos += findFirst(w[pos .. limit], first);
                        if(pos == limit)
                            return false;
                    }
                }

                if(w[pos++] != first)
                    continue;

                // found first element, look for the others
                size_t matched = 1;
                while(matched < ve && w[pos] == delimElems[matched])
                {
                    ++pos;
                    ++matched;
                }
                if(matched == ve)
                    return true;
                // mismatch: pos is at the mismatching element, keep scanning
            }
            return false;
        }
    }

    /// The portion of the upstream window that has been delimited so far.
    auto window() { return chain.window[0 .. checked]; }

    /// Number of code units of delimiter at the end of the window (0 if the
    /// window does not end with the delimiter).
    ubyte delimTrailer() { return endsWithDelim ? validDelimElems : 0; }

    /// Release elements from the front of the window.
    void release(size_t elements)
    {
        checked -= elements;
        chain.release(elements);
    }

    /**
     * Extend the window by exactly one more delimited segment. If the
     * upstream pipe runs out of data first, the remainder of the data becomes
     * the final segment.
     *
     * Params:
     *    elements = passed through to the upstream `extend` whenever more
     *               data is needed.
     * Returns: the number of elements added to the window.
     */
    size_t extend(size_t elements = 0)
    {
        auto newChecked = checked;
        endsWithDelim = false;

        do
        {
            if(scanWindow(chain.window, newChecked))
            {
                endsWithDelim = true;
                break;
            }
        } while(chain.extend(elements) != 0);

        if(!endsWithDelim)
        {
            // ran out of data
            newChecked = chain.window.length;
        }

        immutable prevChecked = checked;
        if(checked != newChecked)
        {
            ++_segments;
            checked = newChecked;
        }
        return checked - prevChecked;
    }

    /// Number of segments that have been produced by `extend` so far.
    size_t segments() { return _segments; }

    mixin implementValve!chain;
}

/**
 * Process a given text iopipe by a given code point delimiter. The only
 * behavior that changes from the input pipe is that extensions to the window
 * deliver exactly one more delimited segment of text.
 *
 * Params:
 *    c = The input text iopipe. This must have a window whose elements are
 *        valid character types.
 *    delim = The code point with which to delimit the text. Each extension to
 *            the iopipe will either end on this delimiter, or will be the last
 *            segment in the pipe.
 * Returns:
 *    An iopipe that behaves as described above.
 */
auto delimitedText(Chain)(Chain c, dchar delim = '\n')
   if(isIopipe!Chain &&
      isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    auto result = DelimitedTextPipe!(Chain)(c);
    // set up the delimiter
    static if(is(result.CodeUnitType == dchar))
    {
        result.delimElems[0] = delim;
    }
    else
    {
        import std.utf: encode;
        result.validDelimElems = cast(ubyte)encode(result.delimElems, delim);
        result.skippableElems = 1; // need to be able to skip at least one element
        foreach(x; result.delimElems[1 .. result.validDelimElems])
        {
            if(x == result.delimElems[0])
                break;
            ++result.skippableElems;
        }
    }
    return result;
}

@safe unittest
{
    static void testIt(X)(X p)
    {
        p.extend;
        assert(p.window == "hello ");
        p.extend;
        assert(p.window == "hello world, ");
        p.extend;
        assert(p.window == "hello world, this ");
        assert(p.segments == 3);
        assert(p.delimTrailer == 1);
        p.process();
        assert(p.segments == 6);
        assert(p.delimTrailer == 0);
    }
    testIt("hello world, this is a test".delimitedText(' '));
    // bug #32
    testIt(SimplePipe!string("hello world, this is a test").delimitedText(' '));

    // add valve support
    import iopipe.valve;
    auto p2 = "hello world".simpleValve.delimitedText(' ');
    auto orig = p2.valve;
    assert(orig == "hello world");
}

@safe unittest
{
    // a failed partial match of a multi code unit delimiter must not stop the
    // scan of the data that is already in the window.
    auto p = "a\u00e9b\u00e8c".delimitedText('\u00e8');
    p.extend;
    assert(p.window == "a\u00e9b\u00e8");
    assert(p.delimTrailer == 2);
    p.extend;
    assert(p.window == "a\u00e9b\u00e8c");
    assert(p.delimTrailer == 0);
}

/**
 * A convenience wrapper for delimitedText that uses the newline character '\n'
 * to delimit the segments. Equivalent to `delimitedText(c, '\n');`
 *
 * Params:
 *    c = The input text iopipe. This must have a window whose elements are
 *        valid character types.
 * Returns:
 *    A line delimited iopipe.
 */
auto byLine(Chain)(Chain c)
{
    return delimitedText(c, '\n');
}

// same as a normal range, but we don't return the delimiter.
// Note that the Chain MUST be a ByDelim iopipe.
private struct NoDelimRange(Chain)
{
    Chain chain;
    // number of delimiter code units at the end of the current window
    ubyte delimElems;
    bool empty() { return chain.window.length == 0; }
    auto front() { return chain.window[0 .. $ - delimElems]; }
    void popFront()
    {
        // drop the current segment and fetch the next one
        chain.release(chain.window.length);
        chain.extend(0);
        delimElems = chain.delimTrailer;
    }
}

/**
 * Given a text iopipe, returns a range based on splitting the text by a given
 * code point. This has the advantage over `delimitedText.asRange` in that the
 * delimiter can be hidden.
 *
 * Params:
 *    KeepDelimiter = If true, then the delimiter is included in each element
 *                    of the range (if present from the original iopipe).
 *    c = The iopipe to range-ify.
 *    delim = The dchar to use for delimiting.
 * Returns:
 *    An input range whose elements are the delimited text segments, with or
 *    without delimiters as specified by the KeepDelimiter boolean.
 */
auto byDelimRange(bool KeepDelimiter = false, Chain)(Chain c, dchar delim)
   if(isIopipe!Chain &&
      isSomeChar!(ElementEncodingType!(WindowType!Chain)))
{
    auto p = c.delimitedText(delim);
    static if(KeepDelimiter)
    {
        // just use standard input range adapter
        return p.asInputRange;
    }
    else
    {
        auto r = NoDelimRange!(typeof(p))(p);
        // pre-fetch first line
        r.popFront();
        return r;
    }
}

/**
 * Convenience wrapper for byDelimRange that uses the newline character '\n' as
 * the delimiter. Equivalent to `byDelimRange!(KeepDelimiter)(c, '\n');`
 *
 * Params:
 *    KeepDelimiter = If true, then the delimiter is included in each element
 *                    of the range (if present from the original iopipe).
 *    c = The iopipe to range-ify.
 * Returns:
 *    An input range whose elements are lines of text from the input iopipe,
 *    with or without delimiters as specified by the KeepDelimiter boolean.
 */
auto byLineRange(bool KeepDelimiter = false, Chain)(Chain c)
{
    return byDelimRange!(KeepDelimiter)(c, '\n');
}

@safe unittest
{
    import std.algorithm : equal;
    assert("hello\nworld".byLineRange.equal(["hello", "world"]));
    assert("hello\nworld".byLineRange!true.equal(["hello\n", "world"]));
    assert("\nhello\nworld".byLineRange.equal(["", "hello", "world"]));
    assert("\nhello\nworld".byLineRange!true.equal(["\n", "hello\n", "world"]));
    assert("\nhello\nworld\n".byLineRange.equal(["", "hello", "world"]));
    assert("\nhello\nworld\n".byLineRange!true.equal(["\n", "hello\n", "world\n"]));
}

/**
 * Output range of characters that writes into the window of an output
 * iopipe, releasing each code point as soon as it has been written.
 *
 * Created by `textOutput`.
 */
struct TextOutput(Chain)
{
    Chain chain;
    alias CT = typeof(Chain.init.window[0]);

    // TODO: allow putting of strings

    /**
     * Write a single character to the pipe, transcoding it to the code unit
     * type of the pipe if the widths differ.
     *
     * A `char` or `wchar` that does not stand for a whole code point on its
     * own (a non-ascii `char`, a surrogate `wchar`) cannot be transcoded and
     * is rejected.
     */
    void put(A)(A c)
    {
        import std.utf : isValidDchar;
        static if(A.sizeof == CT.sizeof)
        {
            // output the data directly to the output stream
            if(chain.ensureElems(1) == 0)
                assert(0);
            chain.window[0] = c;
            chain.release(1);
        }
        else
        {
            static if(is(CT == char))
            {
                static if(is(A : const(wchar)))
                {
                    // A is a wchar. Make sure it's not a surrogate pair
                    // (that it's a valid dchar)
                    if(!isValidDchar(c))
                        assert(0);
                }
                // convert the character to utf8
                if(c <= 0x7f)
                {
                    if(chain.ensureElems(1) == 0)
                        assert(0);
                    chain.window[0] = cast(char)c;
                    chain.release(1);
                }
                else
                {
                    // fill the buffer from the back: continuation bytes
                    // first, each carrying 6 bits, then the lead byte.
                    char[4] buf = void;
                    auto idx = 3;
                    // largest value that still fits in the lead byte; it
                    // loses one bit for every continuation byte emitted.
                    auto mask = 0x3f;
                    dchar c2 = c;
                    while(c2 > mask)
                    {
                        buf[idx--] = 0x80 | (c2 & 0x3f);
                        c2 >>= 6;
                        mask >>= 1;
                    }
                    // lead byte: length marker bits plus the remaining bits
                    buf[idx] = (c2 | (~mask << 1)) & 0xff;
                    const elems = buf.length - idx;
                    if(chain.ensureElems(elems) < elems)
                        assert(0);
                    chain.window[0 .. elems] = buf[idx .. $];
                    chain.release(elems);
                }
            }
            else static if(is(CT == wchar))
            {
                static if(is(A : const(char)))
                {
                    // this is a utf-8 character, only works if it's an
                    // ascii character
                    if(c > 0x7f)
                        throw new Exception("invalid character output");
                }
                // convert the character to utf16
                assert(isValidDchar(c));
                // everything up to and including U+FFFF is a single code
                // unit; only U+10000 and above needs a surrogate pair.
                if(c <= 0xFFFF)
                {
                    if(chain.ensureElems(1) == 0)
                        assert(0);
                    chain.window[0] = cast(wchar)c;
                    chain.release(1);
                }
                else
                {
                    if(chain.ensureElems(2) < 2)
                        assert(0);
                    wchar[2] buf = void;
                    dchar dc = c - 0x10000;
                    buf[0] = cast(wchar)(((dc >> 10) & 0x3FF) + 0xD800);
                    buf[1] = cast(wchar)((dc & 0x3FF) + 0xDC00);
                    chain.window[0..2] = buf;
                    chain.release(2);
                }
            }
            else static if(is(CT == dchar))
            {
                static if(is(A : const(char)))
                {
                    // this is a utf-8 character, only works if it's an
                    // ascii character
                    if(c > 0x7f)
                        throw new Exception("invalid character output");
                }
                else static if(is(A : const(wchar)))
                {
                    // A is a wchar. Make sure it's not a surrogate pair
                    // (that it's a valid dchar)
                    if(!isValidDchar(c))
                        throw new Exception("invalid character output");
                }
                // converting to utf32, just write directly
                if(chain.ensureElems(1) == 0)
                    assert(0);
                chain.window[0] = c;
                chain.release(1);
            }
            else
                static assert(0, "invalid types used for output stream, " ~ CT.stringof ~ ", " ~ A.stringof);
        }
    }
}

/**
 * Take a text-based iopipe and turn it into an output range of `dchar`. Note
 * that the iopipe must be an output iopipe, not an input one. In other words,
 * a `textOutput` result doesn't output its input, it uses its input as a place
 * to deposit data.
 *
 * The given iopipe window will be written to, then data that is ready to be
 * output is released. It is expected that the iopipe will use this mechanism
 * to actually know which data to output. See the example for more information.
 *
 * Params:
 *    c = The output iopipe that can be used to put dchars into.
 * Returns:
 *    An output range that can accept all forms of text data for output.
 */
auto textOutput(Chain)(Chain c)
{
    // create an output range of dchar/code units around c. We assume releasing and
    // extending c will properly output the data.

    return TextOutput!Chain(c);
}

///
@safe unittest
{
    import std.range : put;
    // use a writeable buffer as output.
    char[256] buffer;
    size_t written = 0;

    // this helps us see how many chars are written.
    struct LocalIopipe
    {
        char[] window;
        void release(size_t elems)
        {
            window.release(elems);
            written += elems;
        }
        size_t extend(size_t elems) { return 0; }
    }
    auto oRange = LocalIopipe(buffer[]).textOutput;
    put(oRange, "hello, world");

    // written is updated whenever the iopipe is released
    assert(buffer[0 .. written] == "hello, world");
}

@safe unittest
{
    // U+FFFF is the last code point that fits in a single UTF-16 code unit
    wchar[4] buffer;
    size_t written = 0;

    struct LocalIopipe
    {
        wchar[] window;
        void release(size_t elems)
        {
            window.release(elems);
            written += elems;
        }
        size_t extend(size_t elems) { return 0; }
    }
    auto oRange = LocalIopipe(buffer[]).textOutput;
    oRange.put(dchar(0xFFFF));
    assert(written == 1);
    assert(buffer[0] == 0xFFFF);

    oRange.put(dchar(0x10000));
    assert(written == 3);
    assert(buffer[1] == 0xD800);
    assert(buffer[2] == 0xDC00);
}

/**
 * Convert iopipe of one text type into an iopipe for another type. Performs
 * conversions at the code-point level. If specified, the resulting iopipe will
 * ensure there is a BOM at the beginning of the iopipe. This is useful if
 * writing to storage.
 *
 * If no conversion is necessary, and no BOM is required, the original iopipe
 * is returned.
 *
 * Params:
 *    Char = The desired character type in the resulting iopipe. Must be one
 *           of char, wchar, or dchar.
 *    ensureBOM = If true, the resulting iopipe will ALWAYS have a byte order
 *                mark at the beginning of the stream. At the moment this is
 *                accomplished by copying all the data from the original iopipe to
 *                the new one. A better mechanism is being worked on.
 *    chain = The source iopipe.
 * Returns:
 *    An iopipe which fulfills the given requirements.
 *
 */
auto convertText(Char = char, bool ensureBOM = false, Chain)(Chain chain) if (isSomeChar!Char)
{
    static if(!ensureBOM && is(ElementEncodingType!(WindowType!(Chain)) == Char))
        return chain;
    else
        return chain.textConverter!ensureBOM.bufd!Char;
}

@safe unittest
{
    // test converting char[] to wchar[]
    auto inpipe = "hello";
    immutable(ushort)[] expected = cast(immutable(ushort)[])"\ufeffhello"w;
    auto wpipe = inpipe.convertText!wchar;
    static assert(is(WindowType!(typeof(wpipe)) == wchar[]));

    wpipe.extend(100);// fill the pipe
    assert(wpipe.window.length == 5);
    assert(cast(ushort[])wpipe.window == expected[1 .. $]);

    // ensure the BOM
    auto wpipe2 = inpipe.convertText!(wchar, true);
    wpipe2.extend(100);
    assert(wpipe2.window.length == 6);
    assert(cast(ushort[])wpipe2.window == expected);
}

@safe unittest
{
    // same-width conversion that only has to add the BOM
    auto p = "hello".convertText!(char, true);
    p.extend(100);
    assert(p.window == "\uFEFFhello");
}


/**
 * A converter to allow conversion into any other type of text.
 *
 * The converter does 2 things. First and foremost, it adds a read function
 * that allows conversion into any other width of text. The read function
 * converts as much text as possible into the given format, extending the base
 * iopipe as necessary.
 *
 * The second thing that it does is potentially add a BOM character to the
 * beginning of the text. It was decided to add this here, since you are likely
 * already copying data from one iopipe into another. However, in future
 * versions, this capability may go away, as we can do this elsewhere with less
 * copying. So expect this API to change.
 */
template textConverter(bool ensureBOM = false, Chain)
{
    struct TextConverter
    {
        Chain chain;
        static if(ensureBOM)
        {
            // true until the beginning of the stream has been examined by
            // read, or data has been released.
            bool atBeginning = true;

            auto release(size_t elems)
            {
                atBeginning = atBeginning && elems == 0;
                return chain.release(elems);
            }
        }

        /**
         * Transcode as much of the source pipe as fits into buf, consuming
         * the converted data from the source.
         *
         * When a BOM has to be inserted, buf must have room for it (3 code
         * units for `char`, 1 otherwise).
         *
         * Returns: the number of code units written to buf; 0 means that buf
         *          is empty or the source has no more data.
         */
        size_t read(Char)(Char[] buf)
        {
            alias SrcChar = ElementEncodingType!(WindowType!(Chain));
            if(buf.length == 0)
                return 0;
            // first step, check to see if the first code point is a BOM
            size_t result = 0;
            static if(ensureBOM)
            {
                if(atBeginning)
                {
                    // utf8 bom is 3 code units, in other char types, it's only 1.
                    bool addBOM = true;
                    static if(is(Unqual!SrcChar == char))
                    {
                        // get enough data to recognize an existing BOM
                        while(chain.window.length < 3 && chain.extend(0) != 0) {}
                        if(chain.window.length == 0)
                            return 0; // special case, don't insert a BOM for a blank file.
                        if(chain.window.length >= 3 &&
                           chain.window[0] == 0xef &&
                           chain.window[1] == 0xbb &&
                           chain.window[2] == 0xbf)
                        {
                            addBOM = false;
                        }
                    }
                    else
                    {
                        if(chain.window.length < 1)
                            if(chain.extend(0) == 0)
                                return 0; // special case, don't insert a BOM for a blank file.
                        if(chain.window[0] == 0xfeff)
                            addBOM = false;
                    }

                    if(addBOM)
                    {
                        // write the BOM to the given buffer
                        static if(is(Char == char))
                        {
                            buf[0] = 0xef;
                            buf[1] = 0xbb;
                            buf[2] = 0xbf;

                            result = 3;
                            buf = buf[3 .. $];
                        }
                        else
                        {
                            buf[0] = 0xfeff;
                            result = 1;
                            buf = buf[1 .. $];
                        }
                    }

                    // The start of the stream has been dealt with. Data is
                    // consumed below directly from the source chain (which
                    // does not pass through release above), so this must be
                    // recorded here or every call would insert another BOM.
                    atBeginning = false;
                }
            }
            static if(is(Unqual!Char == Unqual!SrcChar))
            {
                import std.algorithm.comparison: min;
                // try an extend when window length gets to be less than read size.
                if(chain.window.length < buf.length)
                    chain.extend(buf.length - chain.window.length);
                if(chain.window.length == 0)
                    // no more data
                    return 0;
                // copy only what both the source and the target can hold
                auto src = chain.window;
                immutable len = min(src.length, buf.length);
                static if(isDynamicArray!(typeof(src)))
                    buf[0 .. len] = src[0 .. len];
                else
                    foreach(i; 0 .. len)
                        buf[i] = src[i];
                chain.release(len);
                return result + len;
            }
            else
            {
                // need to transcode each code point.
                import std.utf : decode, encode;
                // longest possible encoding of one code point in the SOURCE;
                // with that many elements available decode cannot run into a
                // sequence that was cut off by the end of the window.
                enum minValidElems = is(Unqual!SrcChar == dchar) ? 1 :
                                     is(Unqual!SrcChar == wchar) ? 2 : 4;
                auto win = chain.window;
                size_t pos = 0;
                bool didExtend = false;
                bool eof = false;
                while(buf.length > 0)
                {
                    if(!eof && pos + minValidElems > chain.window.length)
                    {
                        if(didExtend && result > 0)
                            // don't decode any more. We can wait until next time.
                            break;
                        // Nothing has been produced yet (or this is the first
                        // shortage): returning 0 now would look like the end
                        // of the stream, so ask for more data.
                        didExtend = true;
                        // give the upstream pipe some buffer space
                        chain.release(pos);
                        pos = 0;
                        immutable added = chain.extend(0);
                        win = chain.window;
                        if(added != 0)
                            continue;
                        // else, we aren't going to get any more data. decode as needed.
                        eof = true;
                    }
                    if(pos == win.length)
                        // end of the stream
                        break;
                    // decode a code point
                    immutable oldPos = pos;
                    immutable dchar dc = decode(win, pos);
                    // encode the dchar into a new item
                    Char[dchar.sizeof / Char.sizeof] encoded;
                    immutable nChars = encode(encoded, dc);
                    if(nChars > buf.length)
                    {
                        // read as much as we could.
                        pos = oldPos;
                        break;
                    }
                    buf[0 .. nChars] = encoded[0 .. nChars];
                    result += nChars;
                    buf = buf[nChars .. $];
                }

                // release the chain data that we have processed.
                chain.release(pos);
                return result;
            }
        }
        alias chain this;
    }

    auto textConverter(Chain c)
    {
        return TextConverter(c);
    }
}

/**
 * Encode a given text iopipe into the desired encoding type. The resulting
 * iopipe's element type is ubyte, with the bytes ready to be written to a
 * storage device.
 *
 * Params:
 *    enc = The encoding type to use. `Unknown` is treated as UTF8.
 *    c = The source iopipe. Must be an iopipe where the window type's element
 *        type is text based.
 * Returns:
 *    A ubyte iopipe that represents the encoded version of the input iopipe
 *    based on the provided encoding.
 */
auto encodeText(UTFType enc = UTFType.UTF8, Chain)(Chain c)
{
    auto converted = c.convertText!(CodeUnit!enc);

    static if(enc == UTFType.UTF8 || enc == UTFType.Unknown)
    {
        return converted.arrayCastPipe!ubyte;
    }
    else static if(enc == UTFType.UTF16LE || enc == UTFType.UTF32LE)
    {
        return converted.byteSwapper!(true).arrayCastPipe!ubyte;
    }
    else static if(enc == UTFType.UTF16BE || enc == UTFType.UTF32BE)
    {
        return converted.byteSwapper!(false).arrayCastPipe!ubyte;
    }
    else
        static assert(0);
}

@safe unittest
{
    import core.bitop : bswap;
    // ensure that we properly byteswap.
    auto input = "hello";

    version(BigEndian)
        enum encodingType = UTFType.UTF32LE;
    else
        enum encodingType = UTFType.UTF32BE;

    auto testme = input.encodeText!encodingType;
    static assert(is(WindowType!(typeof(testme)) == ubyte[]));

    uint[] expected = cast(uint[])"hello"d.dup;
    foreach(ref v; expected) v = bswap(v);

    testme.extend(100);
    assert(testme.window == cast(ubyte[])expected);
}

/**
 * Given a template function, and an input chain of encoded text data, this
 * function will detect the encoding of the input chain, and convert that
 * runtime value into a compile-time parameter to the given function. Useful
 * for writing code that needs to handle all the forms of text encoding.
 *
 * Use the encoding type as a parameter to assumeText to get an iopipe of
 * `char`, `wchar`, or `dchar` elements for processing.
 *
 * Note that func must return the same type no matter how it's called, as the
 * BOM detection and calling is done at runtime. func is instantiated once for
 * every member of UTFType that can be passed to it, whether the input
 * contains that encoding or not: all six members when UnknownIsUTF8 is false,
 * and the five real encodings when it is true (`Unknown` is then forwarded to
 * the UTF8 instantiation).
 *
 * The second version assumes that the function doesn't care what the encoding
 * is, but just wants to get a text iopipe with the appropriate encoding
 * already handled. In this case, the function will receive a chain of `char`,
 * `wchar`, or `dchar` window elements.
 *
 * Params:
 *    func = The template function to call.
 *    UnknownIsUTF8 = If true, then an undetected encoding will be passed as
 *    UTF8 to your function. Otherwise, the Unknown encoding will be passed.
 *    c = The iopipe input chain that should have encoded text in it.
 *    args = Any optional args to pass to the function.
 * Returns:
 *    The return value from func.
 */
auto ref runWithEncoding(alias func, bool UnknownIsUTF8 = true, Chain, Args...)(Chain c, auto ref Args args)
    if(isIopipe!Chain && is(typeof(detectBOM(c.window))))
{
    // first, detect the encoding
    c.ensureElems(4);
    import std.traits: EnumMembers;
    auto bom = c.window.detectBOM;
    final switch(bom)
    {
        // BUG: static foreach should work, but doesn't, waiting for issue 17807 to
        // make it into a release.
        /*static*/ foreach(enc; EnumMembers!UTFType)
        {
        case enc:
            static if(UnknownIsUTF8 && enc == UTFType.Unknown)
                goto case UTFType.UTF8;
            else
                return func!(enc)(c, args);
        }
    }
}

/// Ditto
auto ref runEncoded(alias func, Chain, Args...)(Chain c, auto ref Args args)
{
    // adapts func to the signature runWithEncoding expects, applying the
    // detected encoding to the chain before handing it over.
    static auto ref forwarder(UTFType enc)(Chain c, auto ref Args args)
    {
        return func(c.assumeText!enc, args);
    }

    return runWithEncoding!forwarder(c, args);
}

// TODO: need unit tests here.
//

@safe nothrow unittest
{
    // try running byline range with a string in CTFE
    import std.range : walkLength;
    static void testit(Char)()
    {
        enum immutable(Char)[] str = "hello\nworld\nthis\nis\na\ntest";
        static assert(str.byLineRange.walkLength == 6);
    }

    testit!char();
    testit!wchar();
    testit!dchar();
}