1 <?php 2 /* Copyright (c) 2012, Geert Bergman (geert@scrivo.nl) 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright notice, 11 * this list of conditions and the following disclaimer in the documentation 12 * and/or other materials provided with the distribution. 13 * 3. Neither the name of "Scrivo" nor the names of its contributors may be 14 * used to endorse or promote products derived from this software without 15 * specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 21 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 * POSSIBILITY OF SUCH DAMAGE. 28 * 29 * $Id: String.php 841 2013-08-19 22:19:47Z geert $ 30 */ 31 32 /** 33 * Implementation of the \Scrivo\String class. 34 */ 35 36 namespace Scrivo; 37 38 /** 39 * Wrapper class for PHP strings to enforce consistent and safe multi-byte 40 * (UTF-8) string handling. 
41 * 42 * \Scrivo\String is a primitive wrapper class for PHP strings to make sure that 43 * all operations performed on the string are UTF-8 safe. As PHP does not 44 * enforce a consistent way to deal with multibyte strings we do it 45 * ourselves. In the Scrivo code base UTF-8 is the only encoding that is 46 * supported for operations on data and these operations should be done 47 * through instances of the \Scrivo\String class. If strings are used as byte 48 * arrays, use the ByteArray class. 49 * 50 * \Scrivo\String objects are imutable: once created you can't change them. All 51 * operations on a \Scrivo\String object will return a new \Scrivo\String object. 52 * 53 * Although we'll be working with UTF-8 exclusively it is possible to create 54 * \Scrivo\String objects that contain characters from 8 byte encoding schemes. 55 * Also a note on HTML entities, we work with UTF-8 so you don't need them: 56 * they are evil. Except entities for the reserved HTML characters (<>&'") 57 * there is really no use for them in UTF-8 strings. And when stored in a 58 * database only cause sorting and lookup errors. Therefore when construction 59 * \Scrivo\String objects you can opt to convert existing HTML entities to their 60 * corresonding UTF-8 characters. 61 * 62 * The current locale setting for LC_COLLATE is important. 63 * \Scrivo\String::compareTo() will use this setting when comparing strings. 64 * 65 * Please note: you might be tempted to do string comparison using 66 * equality operators (==). Although this works in most cases don't do this: 67 * you'll do PHP object comparison (i.e. comparing a 68 * \Scrivo\String object) and that is not what you want: use \Scrivo\String::equals() 69 * or \Scrivo\String::compareTo() to compare strings. 70 */ 71 class String implements \Iterator, \ArrayAccess, \Countable { 72 73 /** 74 * Constant to denote ISO-8859-1 encoding. This is the default encoding 75 * for \Scrivo\String uses for fixing and comparing. 
76 */ 77 const ENC_ISO_8859_1 = "ISO-8859-1"; 78 79 /** 80 * Constant to denote CP-1251 encoding. 81 */ 82 const ENC_CP_1251 = "CP-1251"; 83 84 /** 85 * Constant to indicate that you don't want to decode any entities when 86 * constructing the string. 87 */ 88 const DECODE_NONE = 0; 89 90 /** 91 * Constant to indicate that you want to decode all entities when 92 * constructing the string. 93 */ 94 const DECODE_ALL = 1; 95 96 /** 97 * Constant to indicate that you want to decode all but the entities for 98 * reserved characters (&<>'") when constructing the string. 99 */ 100 const DECODE_UNRESERVED = 2; 101 102 /** 103 * The primitive UTF-8 string. 104 * @var string 105 */ 106 private $str; 107 108 /** 109 * The current position when iterating. 110 * @var string 111 */ 112 private $pos; 113 114 /** 115 * The length of the string (characters not bytes). 116 * @var int 117 */ 118 private $len = -1; 119 120 /** 121 * Collator used for sorting. This is a static shared amongst instances. 122 * @var \Collator 123 */ 124 private static $coll; 125 126 /** 127 * Map to translate 8 byte code page characters to UTF-8 sequences. 
128 * @var array[] 129 */ 130 private static $maps = array( 131 self::ENC_ISO_8859_1 => array(128 => 132 "€","�","‚","ƒ","„","…","†","‡","ˆ","‰","Š","‹","Œ","�","Ž","�", 133 "�","‘","’","“","”","•","–","—","˜","™","š","›","œ","�","ž","Ÿ", 134 " ","¡","¢","£","¤","¥","¦","§","¨","©","ª","«","¬","","®","¯", 135 "°","±","²","³","´","µ","¶","·","¸","¹","º","»","¼","½","¾","¿", 136 "À","Á","Â","Ã","Ä","Å","Æ","Ç","È","É","Ê","Ë","Ì","Í","Î","Ï", 137 "Ð","Ñ","Ò","Ó","Ô","Õ","Ö","×","Ø","Ù","Ú","Û","Ü","Ý","Þ","ß", 138 "à","á","â","ã","ä","å","æ","ç","è","é","ê","ë","ì","í","î","ï", 139 "ð","ñ","ò","ó","ô","õ","ö","÷","ø","ù","ú","û","ü","ý","þ","ÿ", 140 ), 141 self::ENC_CP_1251 => array(128 => 142 "Ђ","Ѓ","‚","ѓ","„","…","†","‡","€","‰","Љ","‹","Њ","Ќ","Ћ","Џ", 143 "ђ","‘","’","“","”","•","–","—","�","™","љ","›","њ","ќ","ћ","џ", 144 " ","Ў","ў","Ј","¤","Ґ","¦","§","Ё","©","Є","«","¬","","®","Ї", 145 "°","±","І","і","ґ","µ","¶","·","ё","№","є","»","ј","Ѕ","ѕ","ї", 146 "А","Б","В","Г","Д","Е","Ж","З","И","Й","К","Л","М","Н","О","П", 147 "Р","С","Т","У","Ф","Х","Ц","Ч","Ш","Щ","Ъ","Ы","Ь","Э","Ю","Я", 148 "а","б","в","г","д","е","ж","з","и","й","к","л","м","н","о","п", 149 "р","с","т","у","ф","х","ц","ч","ш","щ","ъ","ы","ь","э","ю","я", 150 ), 151 ); 152 153 /** 154 * Test if a given byte sequence is a valid UTF-8 sequence. 155 * 156 * If the tested byte sequence is a valid UTF-8 sequence the method 157 * returns the length of the sequence, else the method returns 0. 158 * 159 * @param string $seq The byte sequence to test. 160 * 161 * @return int The length of the UTF-8 sequence (2-4) or 0 if the 162 * sequence is not an UTF-8 sequence. 
/**
 * Test if a byte string starts with a well-formed UTF-8 multi-byte sequence.
 *
 * Only sequences that encode a Unicode scalar value are accepted. Rejected
 * are: overlong forms (lead bytes below 0xC2, 0xE0 followed by 0x80-0x9F,
 * 0xF0 followed by 0x80-0x8F), UTF-16 surrogates (0xED followed by
 * 0xA0-0xBF) and code points above U+10FFFF (0xF4 followed by 0x90-0xBF and
 * lead bytes 0xF5 and up).
 *
 * On top of that the byte directly after the sequence (if there is one) has
 * to be an ASCII byte or a new UTF-8 lead byte. This is stricter than plain
 * UTF-8 validation: it is a heuristic to tell real UTF-8 apart from a run of
 * 8-bit code page characters that happens to look like a sequence.
 *
 * @param string $seq The byte sequence to test. At most the first five
 *    bytes are inspected.
 *
 * @return int The length in bytes of the UTF-8 sequence (2-4) or 0 if the
 *    data does not start with an acceptable UTF-8 sequence.
 */
private function isUtf8Sequence($seq) {

    // Nothing to inspect: avoid reading offset 0 of an empty string.
    $len = strlen($seq);
    if ($len === 0) {
        return 0;
    }

    // The first byte needs to be a lead byte of a multi-byte sequence.
    $b1 = ord($seq[0]);
    if ($b1 < 0xC2 || $b1 >= 0xF5) {
        return 0;
    }

    // The lead byte determines the width of the sequence.
    $width = $b1 < 0xE0 ? 2 : ($b1 < 0xF0 ? 3 : 4);
    if ($len < $width) {
        return 0;
    }

    // Range for the second byte, narrowed for the lead bytes that
    // otherwise allow overlong forms, surrogates or values > U+10FFFF.
    $min = 0x80;
    $max = 0xBF;
    switch ($b1) {
        case 0xE0: $min = 0xA0; break;
        case 0xED: $max = 0x9F; break;
        case 0xF0: $min = 0x90; break;
        case 0xF4: $max = 0x8F; break;
    }
    $b2 = ord($seq[1]);
    if ($b2 < $min || $b2 > $max) {
        return 0;
    }

    // Any further bytes of the sequence are plain continuation bytes.
    for ($i = 2; $i < $width; $i++) {
        $b = ord($seq[$i]);
        if ($b < 0x80 || $b > 0xBF) {
            return 0;
        }
    }

    // The byte after the sequence: none, ASCII or a new lead byte.
    $next = ($len > $width) ? ord($seq[$width]) : 0;
    if ($next < 0x80 || ($next >= 0xC2 && $next < 0xF5)) {
        return $width;
    }

    // This is not an acceptable UTF-8 sequence.
    return 0;
}
/**
 * Turn a string that mixes UTF-8 sequences and 8-bit code page characters
 * into a string in which each high byte is either kept as part of a UTF-8
 * sequence or replaced by its UTF-8 equivalent.
 *
 * Each byte in the 0x80-0xFF range is inspected: if isUtf8Sequence()
 * recognises a UTF-8 sequence at that position the sequence is copied as
 * is, otherwise the byte is taken to be a character of the given 8-bit
 * code page and replaced using the class' translation map. Supported
 * encodings are:
 *
 * * \Scrivo\String::ENC_ISO_8859_1
 * * \Scrivo\String::ENC_CP_1251
 *
 * @param string $str The string with mixed UTF-8 and 8-bit code page
 *    characters.
 * @param string $encoding The code page to assume for bytes that are not
 *    part of a UTF-8 sequence.
 *
 * @return string The converted string.
 *
 * @throws \Scrivo\SystemException If the encoding is not supported.
 */
private function fixCodePageString($str, $encoding) {

    // Only the two code pages we have a map for are accepted.
    $supported = ($encoding == self::ENC_ISO_8859_1
        || $encoding == self::ENC_CP_1251);
    if (!$supported) {
        throw
            new \Scrivo\SystemException("Unsupported encoding: $encoding");
    }

    // Each high byte acts as a delimiter: chunk n starts directly after
    // the n-th high byte, so (offset - 1) is the position of that byte.
    $chunks =
        preg_split('/[\x80-\xFF]/', $str, -1, PREG_SPLIT_OFFSET_CAPTURE);

    // Without any high byte the string is returned untouched.
    $total = count($chunks);
    if ($total <= 1) {
        return $str;
    }

    $result = $chunks[0][0];
    $idx = 1;
    while ($idx < $total) {
        // Look at (at most) six bytes starting at the high byte.
        $window = substr($str, $chunks[$idx][1] - 1, 6);
        $width = $this->isUtf8Sequence($window);
        if ($width) {
            // A UTF-8 sequence: copy it and skip the (empty) chunks
            // that lie between its continuation bytes.
            $char = substr($window, 0, $width);
            $idx += $width - 1;
        } else {
            // A code page character: translate it.
            $char = self::$maps[$encoding][ord($window[0])];
        }
        // Append the character and the ASCII text that follows it.
        $result .= $char . $chunks[$idx][0];
        $idx++;
    }

    return $result;
}
/**
 * Convert a string with HTML entities, UTF-8 and code page characters
 * to a UTF-8 string.
 *
 * Two independent clean-up steps can be requested:
 *
 * 1. If $encoding is not "UTF-8", bytes in the 0x80-0xFF range that are not
 *    part of a UTF-8 sequence are treated as 8-bit code page characters and
 *    converted by fixCodePageString(). Supported encodings are:
 *
 *    * \Scrivo\String::ENC_ISO_8859_1
 *    * \Scrivo\String::ENC_CP_1251
 *
 * 2. HTML entities can be converted to their corresponding characters:
 *
 *    * \Scrivo\String::DECODE_NONE, don't decode HTML entities;
 *    * \Scrivo\String::DECODE_ALL, decode all HTML entities;
 *    * \Scrivo\String::DECODE_UNRESERVED, decode all but the entities for
 *      the reserved characters < > & " and '. Numeric forms of these
 *      entities are normalised to &lt; &gt; &amp; &quot; and &#039;.
 *
 * @param string $str The source string, a possible mixture of HTML
 *    entities, UTF-8 and code page characters.
 * @param int $toDecode Which entities to decode, one of the DECODE_*
 *    constants.
 * @param string $encoding The encoding to use when converting 8-bit code
 *    page characters to UTF-8, or "UTF-8" to skip that step.
 *
 * @return string The converted string.
 *
 * @throws \Scrivo\SystemException If an unsupported encoding was given.
 */
private function fixString($str, $toDecode=self::DECODE_NONE,
        $encoding="UTF-8") {

    // List of HTML entities we want to keep: the named, decimal and
    // hexadecimal notations for < > & " and '. The first row holds the
    // canonical forms that are written back after decoding.
    $reserved = array(
        "&lt;", "&gt;", "&amp;", "&quot;", "&#039;",
        "&#60;", "&#62;", "&#38;", "&#34;", "&#39;",
        "&#x3C;", "&#x3E;", "&#x26;", "&#x22;", "&#x27;"
    );

    // List of markers to replace the entities you want to keep, so
    // html_entity_decode will leave them alone. Each notation of an
    // entity maps to the same marker.
    $save = array(
        "#*@lt!;", "#*@gt!;", "#*@amp!;", "#*@quot!;", "#*@#039!;",
        "#*@lt!;", "#*@gt!;", "#*@amp!;", "#*@quot!;", "#*@#039!;",
        "#*@lt!;", "#*@gt!;", "#*@amp!;", "#*@quot!;", "#*@#039!;"
    );

    if ($toDecode == self::DECODE_UNRESERVED) {
        // 'Save' entities for reserved characters.
        $str = str_replace($reserved, $save, $str);
    }
    if ($encoding != "UTF-8") {
        // Fix characters that are not properly UTF-8 encoded.
        $str = $this->fixCodePageString($str, $encoding);
    }
    if ($toDecode != self::DECODE_NONE) {
        // Change all entities to their corresponding UTF-8 characters.
        $str = html_entity_decode($str, ENT_QUOTES, "UTF-8");
    }
    if ($toDecode == self::DECODE_UNRESERVED) {
        // 'Restore' previously saved entities in their canonical form.
        $str = str_replace(array_slice($save, 0, 5),
            array_slice($reserved, 0, 5), $str);
    }

    return $str;
}

/**
 * Get a substring from this string without first checking the boundaries.
 *
 * The arguments are handed to mb_substr() as is, so offsets and lengths
 * are counted in UTF-8 characters, not bytes.
 *
 * @param int $start Start offset for the substring, use a negative number
 *    to use an offset from the end of the string.
 * @param int $length The length of the substring.
 *
 * @return \Scrivo\String The requested portion of this string.
 */
private function unsafeSubstr($start, $length) {
    return new \Scrivo\String(
        mb_substr($this->str, $start, $length, "UTF-8"));
}
/**
 * Construct an \Scrivo\String.
 *
 * With the default arguments the given value is cast to a PHP string and
 * stored as is: it is then assumed to be valid UTF-8. If $toDecode and/or
 * $encoding are set the value is first passed through fixString().
 *
 * Possible choices for $toDecode are:
 *
 * * \Scrivo\String::DECODE_NONE, don't decode HTML entities;
 * * \Scrivo\String::DECODE_ALL, decode all HTML entities;
 * * \Scrivo\String::DECODE_UNRESERVED, decode all but the HTML entities
 *   for the reserved characters < > & " and '.
 *
 * Supported values for $encoding (next to the default "UTF-8") are:
 *
 * * \Scrivo\String::ENC_ISO_8859_1
 * * \Scrivo\String::ENC_CP_1251
 *
 * Note: the clean-up is relatively expensive, so it is best used when
 * sanitizing data before storing it and not in bulk operations.
 *
 * @param string $str The source string, a possible mixture of HTML
 *    entities, UTF-8 and code page characters.
 * @param int $toDecode Which entities to decode.
 * @param string $encoding The encoding to use when converting 8-bit code
 *    page characters to UTF-8.
 */
public function __construct($str="", $toDecode=self::DECODE_NONE,
        $encoding="UTF-8") {
    $str = (string)$str;
    if ($toDecode == self::DECODE_NONE && $encoding == "UTF-8") {
        // Nothing to fix: take the string as it is.
        $this->str = $str;
    } else {
        $this->str = $this->fixString($str, $toDecode, $encoding);
    }
    // Start iterating at the first character.
    $this->pos = 0;
}

/**
 * Factory method to construct an \Scrivo\String, or an array of them.
 *
 * If an array is given each of its elements is (recursively) replaced by
 * the result of this method, keys are preserved.
 *
 * @see \Scrivo\String::__construct()
 *
 * @param string|array $str The string (or array of strings) to create
 *    the wrapper(s) for. It is assumed that this will be valid UTF-8
 *    data. If this is not the case, you'll need to set the additional
 *    parameters.
 * @param int $toDecode Which entities to decode.
 * @param string $encoding The encoding to use when converting 8-bit code
 *    page characters to UTF-8.
 *
 * @return \Scrivo\String|array An \Scrivo\String wrapper object, or an
 *    array of wrapper objects if an array was given.
 */
public static function create($str="", $toDecode=self::DECODE_NONE,
        $encoding="UTF-8") {
    if (!is_array($str)) {
        return new \Scrivo\String($str, $toDecode, $encoding);
    }
    foreach (array_keys($str) as $key) {
        $str[$key] = self::create($str[$key], $toDecode, $encoding);
    }
    return $str;
}

/**
 * Get the collator for sorting strings.
 *
 * If no collator was set yet, one is created for the default locale and
 * kept for later calls.
 *
 * @return \Collator The currently set collator for the \Scrivo\String
 *    class.
 */
public static function getCollator() {
    if (self::$coll) {
        return self::$coll;
    }
    self::$coll = new \Collator(\Locale::getDefault());
    return self::$coll;
}
/**
 * Set the collator for sorting strings.
 *
 * The collator is shared by all \Scrivo\String instances.
 *
 * @param \Collator $coll The collator to use.
 */
public static function setCollator(\Collator $coll) {
    self::$coll = $coll;
}

/**
 * Implementation of the readable properties using the PHP magic
 * method __get().
 *
 * Readable properties are "length" (see getLength()) and "collator" (see
 * getCollator()).
 *
 * @param string $name The name of the property to get.
 *
 * @return mixed The value of the requested property.
 *
 * @throws \Scrivo\SystemException If the property does not exist.
 */
public function __get($name) {
    switch($name) {
        case "length": return $this->getLength();
        case "collator": return self::getCollator();
    }
    throw new \Scrivo\SystemException("No such property '$name'.");
}

/**
 * Return the primitive UTF-8 string for this instance.
 *
 * @return string The primitive UTF-8 string for this instance.
 */
public function __toString() {
    return $this->str;
}

/**
 * Test if this string equals another \Scrivo\String object.
 *
 * The primitive strings are compared byte for byte. Use this method and
 * not the equality operator (==) on \Scrivo\String objects: that would
 * compare all data members of the objects.
 *
 * @param \Scrivo\String $str The string to compare this string to.
 *
 * @return boolean True if the given string equals this string.
 */
public function equals(\Scrivo\String $str) {
    // Strict comparison is required here: with == PHP compares strings
    // that look like numbers numerically ("1e3" == "1000" is true).
    return (string)$this->str === (string)$str;
}

/**
 * Get the length of the string.
 *
 * The length is determined on first use and then remembered.
 *
 * @return int The length of the string in characters (not bytes).
 */
public function getLength() {
    if ($this->len == -1) {
        $this->len = mb_strlen($this->str, "UTF-8");
    }
    return $this->len;
}
/**
 * Return the character count of the string.
 *
 * Part of the implementation of Countable, equivalent to getLength().
 *
 * @return int The length of the string in characters.
 */
public function count() {
    $characters = $this->getLength();
    return $characters;
}

/**
 * Return the character at the current iterator position.
 *
 * Part of the implementation of Iterator. The position is not checked
 * here: when iterating valid() is called before current().
 *
 * @return \Scrivo\String The current UTF-8 character, wrapped in a new
 *    \Scrivo\String object.
 */
public function current() {
    $character = $this->unsafeSubstr($this->pos, 1);
    return $character;
}

/**
 * Return the current iterator position.
 *
 * Part of the implementation of Iterator.
 *
 * @return int The index of the current UTF-8 character in this string.
 */
public function key() {
    $index = $this->pos;
    return $index;
}

/**
 * Advance the iterator position by one character.
 *
 * Part of the implementation of Iterator.
 */
public function next() {
    ++$this->pos;
}

/**
 * Move the iterator position back to the first character.
 *
 * Part of the implementation of Iterator.
 */
public function rewind() {
    $this->pos = 0;
}

/**
 * Check if the iterator position lies within the string.
 *
 * Part of the implementation of Iterator.
 *
 * @return boolean True if the current character index is valid else false.
 */
public function valid() {
    $outside = ($this->pos < 0 || $this->pos >= $this->getLength());
    return !$outside;
}
/**
 * Illegal method: set a character at a specified index location.
 *
 * Part of the implementation of ArrayAccess. \Scrivo\String objects are
 * immutable, so writing through array brackets is refused.
 *
 * @param int $offset Ignored.
 * @param string $value Ignored.
 *
 * @throws \Scrivo\SystemException Always.
 */
public function offsetSet($offset, $value) {
    $message = "offsetSet can't be called on \Scrivo\String objects";
    throw new \Scrivo\SystemException($message);
}

/**
 * Get an UTF-8 character from this string using array brackets.
 *
 * Part of the implementation of ArrayAccess.
 *
 * @param int $offset A character offset in the string.
 *
 * @return \Scrivo\String The character at the given offset.
 *
 * @throws \Scrivo\SystemException If the requested offset was out of range.
 */
public function offsetGet($offset) {
    if ($this->offsetExists($offset)) {
        return $this->unsafeSubstr($offset, 1);
    }
    throw new \Scrivo\SystemException(
        "String index [$offset] out of bounds");
}

/**
 * Check if the specified index location in this string is valid.
 *
 * Part of the implementation of ArrayAccess.
 *
 * @param int $offset A character offset in the string.
 *
 * @return boolean True if the offset is not negative and smaller than
 *    the length of the string.
 */
public function offsetExists($offset) {
    if ($offset < 0) {
        return false;
    }
    return $offset < $this->getLength();
}
/**
 * Illegal method: unset a character at a specified index location.
 *
 * Part of the implementation of ArrayAccess. \Scrivo\String objects are
 * immutable, so unsetting characters is refused.
 *
 * @param int $offset Ignored.
 *
 * @throws \Scrivo\SystemException Always.
 */
public function offsetUnset($offset) {
    throw new \Scrivo\SystemException(
        "offsetUnset can't be called on \Scrivo\String objects");
}

/**
 * Get a substring from a string using an offset and a length.
 *
 * Just like PHP's native substr function this method returns a substring
 * from this string using an offset and a length. But note that this
 * method will throw an exception if the offset is invalid.
 *
 * A non-negative $start has to be the index of an existing character. A
 * negative $start counts from the end of the string and may range from
 * -1 (the last character) down to -length (the first character).
 *
 * @param int $start Start offset for the substring, use a negative number
 *    to use an offset from the end of the string.
 * @param int $length The length of the substring in characters. If
 *    omitted at most 0xFFFF (65535) characters are returned.
 *
 * @return \Scrivo\String The portion of this string specified by the $start
 *    and $length parameter.
 *
 * @throws \Scrivo\SystemException if the requested offset was out of range.
 */
public function substr($start, $length=0xFFFF) {
    if ($start < 0) {
        // Counting from the end, -length is the first character and
        // therefore still a valid position.
        $valid = (-$start <= $this->getLength());
    } else {
        $valid = $this->offsetExists($start);
    }
    if (!$valid) {
        throw new \Scrivo\SystemException(
            "String index [$start] out of bounds");
    }
    return $this->unsafeSubstr($start, $length);
}
/**
 * Get a substring from a string using a start and end index.
 *
 * This method is inspired by its Java counterpart: the substring starts
 * at index $start and runs up to, but not including, index $end. So the
 * following needs to hold: 0 <= $start <= $end <= length.
 *
 * @param int $start Start offset for the substring (inclusive).
 * @param int $end The end offset for the substring (exclusive).
 *
 * @return \Scrivo\String The portion of this string specified by the $start
 *    and $end parameter.
 *
 * @throws \Scrivo\SystemException if the requested offset was out of range.
 */
public function substring($start, $end) {
    // $end is exclusive, so it is allowed to be equal to the length:
    // else the last character could never be part of the result.
    if ($start < 0 || $end > $this->getLength() || $start > $end) {
        throw new \Scrivo\SystemException(
            "String index [$start, $end] out of bounds");
    }
    return $this->unsafeSubstr($start, $end-$start);
}

/**
 * Get a trimmed copy of this string.
 *
 * Returns a copy of the string, with leading and trailing whitespace
 * removed. Whitespace characters are the characters matched by the
 * regular expression class \s (like ' ', \t, \r and \n) and the
 * non-breaking space (U+00A0).
 *
 * @return \Scrivo\String A copy of this string with leading and trailing
 *    white space removed.
 */
public function trim() {
    // The non-breaking space is written as an escape sequence: a literal
    // one is invisible and easily replaced by a normal space.
    return new \Scrivo\String(preg_replace(
        '/(^[\s\x{00A0}]+)|([\s\x{00A0}]+$)/us', "", $this->str));
}

/**
 * Check if the string contains the given substring.
 *
 * This is the test you normally use strpos(...) !== false for.
 *
 * @param \Scrivo\String $str The string to search for.
 * @param int $offset An offset (in characters) from where to start the
 *    search.
 * @param boolean $ignoreCase Set to perform a case insensitive lookup.
 *
 * @return boolean True if the given string is contained by this string.
 *
 * @throws \Scrivo\SystemException If the $offset is out of range.
 */
public function contains(\Scrivo\String $str, $offset=0, $ignoreCase=false) {
    if ($offset && !$this->offsetExists($offset)) {
        throw new \Scrivo\SystemException(
            "String index [$offset] out of bounds");
    }
    // The offset is a character offset, so the multi-byte functions are
    // needed in both cases: strpos would read it as a byte offset.
    $needle = (string)$str;
    if ($ignoreCase) {
        $found = mb_stripos($this->str, $needle, $offset, "UTF-8");
    } else {
        $found = mb_strpos($this->str, $needle, $offset, "UTF-8");
    }
    return $found !== false;
}
/**
 * Returns the index of the given substring in this string.
 *
 * Comparable to PHP's native strpos and stripos functions, with two
 * differences: -1 (and not false) is returned if the substring was not
 * found, and an exception is raised if a non-zero offset does not refer to
 * an existing character.
 *
 * @param \Scrivo\String $str The string to search for.
 * @param int $offset An offset from where to start the search.
 * @param boolean $ignoreCase Set to perform a case insensitive lookup.
 *
 * @return int The index of the first occurrence of the substring at or
 *    after $offset and -1 if the substring was not found.
 *
 * @throws \Scrivo\SystemException If the $offset is out of range.
 */
public function indexOf(\Scrivo\String $str, $offset=0, $ignoreCase=false) {
    if ($offset && !$this->offsetExists($offset)) {
        throw new \Scrivo\SystemException(
            "String index [$offset] out of bounds");
    }
    $found = $ignoreCase
        ? mb_stripos($this->str, $str, $offset, "UTF-8")
        : mb_strpos($this->str, $str, $offset, "UTF-8");
    if ($found === false) {
        return -1;
    }
    return $found;
}
/**
 * Returns the index of the last occurrence of the given substring in this
 * string.
 *
 * Comparable to PHP's native strrpos and strripos functions, with two
 * differences: -1 (and not false) is returned if the substring was not
 * found, and an exception is raised if the given offset is out of range.
 *
 * A positive offset has to be the index of an existing character, a
 * negative offset may range from -1 down to -length. The offset is handed
 * to mb_strrpos()/mb_strripos() unchanged.
 *
 * @param \Scrivo\String $str The string to search for.
 * @param int $offset An offset from where to start the search. A positive
 *    value indicates an offset measured from the start of the string, a
 *    negative value from the end of the string.
 * @param boolean $ignoreCase Perform a case insensitive lookup.
 *
 * @return int The index of the last occurrence of the substring, or -1 if
 *    the substring was not found.
 *
 * @throws \Scrivo\SystemException If the $offset is out of range.
 */
public function lastIndexOf(\Scrivo\String $str, $offset=0, $ignoreCase=false) {
    if ($offset) {
        if ($offset < 0) {
            // Measured from the end -length is still inside the string.
            $valid = (-$offset <= $this->getLength());
        } else {
            $valid = $this->offsetExists($offset);
        }
        if (!$valid) {
            throw new \Scrivo\SystemException(
                "String index [$offset] out of bounds");
        }
    }
    if ($ignoreCase) {
        $res = mb_strripos($this->str, $str, $offset, "UTF-8");
    } else {
        $res = mb_strrpos($this->str, $str, $offset, "UTF-8");
    }
    return $res !== false ? $res : -1;
}
/**
 * Returns the part of this string relative to the first occurrence of a
 * given substring.
 *
 * Comparable to PHP's native strstr and stristr functions: by default the
 * result starts at the first occurrence of the substring and runs to the
 * end of this string. Note that this method throws an exception if an
 * empty search string was given.
 *
 * @param \Scrivo\String $str The string to search for.
 * @param boolean $part Set to get the part of this string before the
 *    first occurrence of the substring (excluding the substring) instead
 *    of the part starting at the substring.
 * @param boolean $ignoreCase Perform a case insensitive lookup.
 *
 * @return \Scrivo\String The requested part of this string or NULL if the
 *    substring was not found.
 *
 * @throws \Scrivo\SystemException If an empty search string was given.
 */
public function firstOccurranceOf(\Scrivo\String $str, $part=false,
        $ignoreCase=false) {
    if ($str->getLength() == 0) {
        throw new \Scrivo\SystemException(
            "firstOccurranceOf requires a search string");
    }
    $match = $ignoreCase
        ? mb_stristr($this->str, $str, $part, "UTF-8")
        : mb_strstr($this->str, $str, $part, "UTF-8");
    if ($match === false) {
        return NULL;
    }
    return new \Scrivo\String($match);
}
/**
 * Returns the part of this string relative to the last occurrence of a
 * given character.
 *
 * Comparable to PHP's native strrchr function: by default the result
 * starts at the last occurrence of the character and runs to the end of
 * this string.
 *
 * @param \Scrivo\String $str The character to search for.
 * @param boolean $part Set to get the part of this string before the
 *    last occurrence of the character (excluding the character) instead
 *    of the part starting at the character.
 * @param boolean $ignoreCase Perform a case insensitive lookup.
 *
 * @return \Scrivo\String The requested part of this string or NULL if the
 *    character was not found.
 *
 * @throws \Scrivo\SystemException If a search string of not exactly one
 *    character in length was given.
 */
public function lastOccurranceOf(\Scrivo\String $str, $part=false,
        $ignoreCase=false) {
    if ($str->getLength() != 1) {
        throw new \Scrivo\SystemException(
            "lastOccurranceOf accepts single charaters only");
    }
    $match = $ignoreCase
        ? mb_strrichr($this->str, $str, $part, "UTF-8")
        : mb_strrchr($this->str, $str, $part, "UTF-8");
    if ($match === false) {
        return NULL;
    }
    return new \Scrivo\String($match);
}

/**
 * Replace a substring or set of substrings in this string.
 *
 * A type checked front end for PHP's native str_replace. Accepted
 * combinations of arguments are:
 *
 * * a single string to replace and a single replacement;
 * * an array of strings to replace and a single replacement;
 * * an array of strings to replace and an equally sized array of
 *   replacements.
 *
 * @param \Scrivo\String|\Scrivo\String[] $from A (set of) string(s) to
 *    replace in this string.
 * @param \Scrivo\String|\Scrivo\String[] $to A (set of) replacement
 *    string(s) to replace the found string(s).
 *
 * @return \Scrivo\String A string with the replaced values.
 *
 * @throws \Scrivo\SystemException If the input data is not of type
 *    \Scrivo\String or \Scrivo\String[], or if the $to parameter is an
 *    array and $from isn't or hasn't the same number of elements.
 */
public function replace($from, $to) {
    $singleFrom = $from instanceof \Scrivo\String;
    $singleTo = $to instanceof \Scrivo\String;

    // One search string, one replacement.
    if ($singleFrom && $singleTo) {
        return new \Scrivo\String(str_replace($from, $to, $this->str));
    }

    // Many search strings, one replacement.
    if (is_array($from) && $singleTo) {
        foreach ($from as $pos => $item) {
            if ($item instanceof \Scrivo\String) {
                continue;
            }
            throw new \Scrivo\SystemException("From element is"
                . " not an \Scrivo\String as array position [{$pos}]");
        }
        return new \Scrivo\String(str_replace($from, $to, $this->str));
    }

    // Many search strings, each with its own replacement.
    if (is_array($from) && is_array($to)) {
        if (count($from) != count($to)) {
            throw new \Scrivo\SystemException(
                "Input arrays are not the same size");
        }
        foreach ($from as $pos => $item) {
            $bothStrings = ($item instanceof \Scrivo\String)
                && ($to[$pos] instanceof \Scrivo\String);
            if (!$bothStrings) {
                throw new \Scrivo\SystemException("To or from element is"
                    . " not an \Scrivo\String as array position [{$pos}]");
            }
        }
        return new \Scrivo\String(str_replace($from, $to, $this->str));
    }

    throw new \Scrivo\SystemException("Invalid argument types");
}
*/
    public function split(\Scrivo\String $delimiter, $limit=0) {
        if ($delimiter == "") {
            throw new \Scrivo\SystemException(
                "split cannot use an empty \"\" delimiter.");
        }
        // A limit of 0 means "no limit", so it is not passed on to explode.
        if ($limit) {
            $parts = explode($delimiter, $this->str, $limit);
        } else {
            $parts = explode($delimiter, $this->str);
        }
        // Wrap each fragment while keeping the keys explode produced.
        $wrapped = array();
        foreach ($parts as $index=>$part) {
            $wrapped[$index] = new \Scrivo\String($part);
        }
        return $wrapped;
    }

    /**
     * Get a copy of this string with all of its characters converted to
     * lower case.
     *
     * @return \Scrivo\String A string containing only lower case characters.
     */
    public function toLowerCase() {
        $lowered = mb_strtolower($this->str, "UTF-8");
        return new \Scrivo\String($lowered);
    }

    /**
     * Get a copy of this string with all of its characters converted to
     * upper case.
     *
     * @return \Scrivo\String A string containing only upper case characters.
     */
    public function toUpperCase() {
        $raised = mb_strtoupper($this->str, "UTF-8");
        return new \Scrivo\String($raised);
    }

    /**
     * Compare this string to another \Scrivo\String object.
     *
     * The comparison is delegated to the collator that is returned by
     * self::getCollator(). Note that this method requires the \Scrivo\String
     * collator to be set, else the method falls back to the default locale
     * for creating a collator and generates a warning.
     *
     * @param \Scrivo\String $str The string to compare this string to.
     *
     * @return int Less than 0 if this string is less than the given
     *    string $str; more than 0 if this string is greater than $str, and
     *    0 if they are equal.
     */
    public function compareTo(\Scrivo\String $str) {
        $collator = self::getCollator();
        return $collator->compare($this->str, $str);
    }

    /**
     * Check if this string exists in an array of \Scrivo\String-s.
     *
     * @param \Scrivo\String[] $arr The array to search.
     *
     * @return mixed If found the key of the first occurrence of the string
     *    in the array, else null.
*/
    public function inArray($arr) {
        foreach ($arr as $key=>$candidate) {
            // Skip entries that differ; the first equal entry wins.
            if (!$candidate->equals($this)) {
                continue;
            }
            return $key;
        }
        return null;
    }
}

?>
Documentation generated by phpDocumentor 2.0.0a12 and ScrivoDocumentor on August 29, 2013