Scrivo PHP documentation: String.php

   1  <?php
   2  /* Copyright (c) 2012, Geert Bergman (geert@scrivo.nl)
   3   * All rights reserved.
   4   *
   5   * Redistribution and use in source and binary forms, with or without
   6   * modification, are permitted provided that the following conditions are met:
   7   *
   8   * 1. Redistributions of source code must retain the above copyright notice,
   9   *    this list of conditions and the following disclaimer.
  10   * 2. Redistributions in binary form must reproduce the above copyright notice,
  11   *    this list of conditions and the following disclaimer in the documentation
  12   *    and/or other materials provided with the distribution.
  13   * 3. Neither the name of "Scrivo" nor the names of its contributors may be
  14   *    used to endorse or promote products derived from this software without
  15   *    specific prior written permission.
  16   *
  17   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  21   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27   * POSSIBILITY OF SUCH DAMAGE.
  28   *
  29   * $Id: String.php 841 2013-08-19 22:19:47Z geert $
  30   */
  31  
  32  /**
  33   * Implementation of the \Scrivo\String class.
  34   */
  35  
  36  namespace Scrivo;
  37  
  38  /**
  39   * Wrapper class for PHP strings to enforce consistent and safe multi-byte
  40   * (UTF-8) string handling.
  41   *
  42   * \Scrivo\String is a primitive wrapper class for PHP strings to make sure that
  43   * all operations performed on the string are UTF-8 safe. As PHP does not
  44   * enforce a consistent way to deal with multibyte strings we do it
  45   * ourselves. In the Scrivo code base UTF-8 is the only encoding that is
  46   * supported for operations on data and these operations should be done
  47   * through instances of the \Scrivo\String class. If strings are used as byte
  48   * arrays, use the ByteArray class.
  49   *
  50   * \Scrivo\String objects are immutable: once created you can't change them. All
  51   * operations on a \Scrivo\String object will return a new \Scrivo\String object.
  52   *
  53   * Although we'll be working with UTF-8 exclusively it is possible to create
  54   * \Scrivo\String objects that contain characters from 8 bit encoding schemes.
  55   * Also a note on HTML entities: we work with UTF-8 so you don't need them,
  56   * they are evil. Except entities for the reserved HTML characters (<>&'")
  57   * there is really no use for them in UTF-8 strings, and when stored in a
  58   * database they only cause sorting and lookup errors. Therefore when
  59   * constructing \Scrivo\String objects you can opt to convert existing HTML
  60   * entities to their corresponding UTF-8 characters.
  61   *
  62   * The current locale setting for LC_COLLATE is important.
  63   * \Scrivo\String::compareTo() will use this setting when comparing strings.
  64   *
  65   * Please note: you might be tempted to do string comparison using
  66   * equality operators (==). Although this works in most cases don't do this:
  67   * you'll do PHP object comparison (i.e. comparing a
  68   * \Scrivo\String object) and that is not what you want: use \Scrivo\String::equals()
  69   * or \Scrivo\String::compareTo() to compare strings.
  70   */
  71  class String implements \Iterator, \ArrayAccess, \Countable {
  72  
  73      /**
  74       * Constant to denote ISO-8859-1 encoding. This is the default encoding
  75       * that \Scrivo\String uses for fixing and comparing.
  76       */
  77      const ENC_ISO_8859_1 = "ISO-8859-1";
  78  
  79      /**
  80       * Constant to denote CP-1251 encoding.
  81       */
  82      const ENC_CP_1251 = "CP-1251";
  83  
  84      /**
  85       * Constant to indicate that you don't want to decode any entities when
  86       * constructing the string.
  87       */
  88      const DECODE_NONE = 0;
  89  
  90      /**
  91       * Constant to indicate that you want to decode all entities when
  92       * constructing the string.
  93       */
  94      const DECODE_ALL = 1;
  95  
  96      /**
  97       * Constant to indicate that you want to decode all but the entities for
  98       * reserved characters (&<>'") when constructing the string.
  99       */
 100      const DECODE_UNRESERVED = 2;
 101  
 102      /**
 103       * The primitive UTF-8 string.
 104       * @var string
 105       */
 106      private $str;
 107  
 108      /**
 109       * The current position when iterating.
 110       * @var int
 111       */
 112      private $pos;
 113  
 114      /**
 115       * The length of the string (characters not bytes).
 116       * @var int
 117       */
 118      private $len = -1;
 119  
 120      /**
 121       * Collator used for sorting. This is a static shared amongst instances.
 122       * @var \Collator
 123       */
 124      private static $coll;
 125  
 126      /**
 127       * Map to translate 8 bit code page characters to UTF-8 sequences.
 128       * @var array[]
 129       */
 130      private static $maps = array(
 131          self::ENC_ISO_8859_1 => array(128 =>
 132              "€","�","‚","ƒ","„","…","†","‡","ˆ","‰","Š","‹","Œ","�","Ž","�",
 133              "�","‘","’","“","”","•","–","—","˜","™","š","›","œ","�","ž","Ÿ",
 134              " ","¡","¢","£","¤","¥","¦","§","¨","©","ª","«","¬","","®","¯",
 135              "°","±","²","³","´","µ","¶","·","¸","¹","º","»","¼","½","¾","¿",
 136              "À","Á","Â","Ã","Ä","Å","Æ","Ç","È","É","Ê","Ë","Ì","Í","Î","Ï",
 137              "Ð","Ñ","Ò","Ó","Ô","Õ","Ö","×","Ø","Ù","Ú","Û","Ü","Ý","Þ","ß",
 138              "à","á","â","ã","ä","å","æ","ç","è","é","ê","ë","ì","í","î","ï",
 139              "ð","ñ","ò","ó","ô","õ","ö","÷","ø","ù","ú","û","ü","ý","þ","ÿ",
 140          ),
 141          self::ENC_CP_1251 => array(128 =>
 142              "Ђ","Ѓ","‚","ѓ","„","…","†","‡","€","‰","Љ","‹","Њ","Ќ","Ћ","Џ",
 143              "ђ","‘","’","“","”","•","–","—","�","™","љ","›","њ","ќ","ћ","џ",
 144              " ","Ў","ў","Ј","¤","Ґ","¦","§","Ё","©","Є","«","¬","","®","Ї",
 145              "°","±","І","і","ґ","µ","¶","·","ё","№","є","»","ј","Ѕ","ѕ","ї",
 146              "А","Б","В","Г","Д","Е","Ж","З","И","Й","К","Л","М","Н","О","П",
 147              "Р","С","Т","У","Ф","Х","Ц","Ч","Ш","Щ","Ъ","Ы","Ь","Э","Ю","Я",
 148              "а","б","в","г","д","е","ж","з","и","й","к","л","м","н","о","п",
 149              "р","с","т","у","ф","х","ц","ч","ш","щ","ъ","ы","ь","э","ю","я",
 150          ),
 151      );
 152  
 153      /**
 154       * Test if a given byte sequence is a valid UTF-8 sequence.
 155       *
 156       * If the tested byte sequence is  a valid UTF-8 sequence the method
 157       * returns the length of the sequence, else the method returns 0.
 158       *
 159       * @param string $seq The byte sequence to test.
 160       *
 161       * @return int The length of the UTF-8 sequence (2-4) or 0 if the
 162       *   sequence is not an UTF-8 sequence.
 163       */
 164      private function isUtf8Sequence($seq) {
 165  
 166          // check if the first byte is a UTF-8 marker and if not end it here
 167          $b1 = ord($seq[0]);
 168          if ($b1<0xC2 || $b1>=0xF5) {
 169             return 0;
 170          }
 171  
 172          // get the length to prevent overshooting when reading
 173          $len = strlen($seq);
 174  
 175          // Get the 2nd and 3rd byte and test it, note that for a valid UTF-8
 176          // sequence we only allow the byte (here byte three) after the
 177          // sequence to be none, an ascii character, or a new UTF-8 marker
 178          // (which is more limiting than not to allow continuation bytes
 179          // (b3 < 0x80 && b3 >= OxBF) only).
 180          $b2 = ($len>1) ? ord($seq[1]) : 0;
 181          $b3 = ($len>2) ? ord($seq[2]) : 0;
 182          if ($b1>=0xC2 && $b1<0xE0 && $b2>=0x80 && $b2<0xC0
 183                  && ($b3<0x80 || ($b3>=0xC2 && $b3<0xF5))) {
 184              return 2;
 185          }
 186  
 187          // We're not there, test for a 3 byte byte sequence. See the comment
 188          // above on testing the 4th byte.
 189          $b4 = ($len>3) ? ord($seq[3]) : 0;
 190          if ($b1>=0xE0 && $b1<0xF0
 191                  && $b2>=0x80 && $b2<0xC0 && $b3>=0x80 && $b3<0xC0
 192                  && ($b4<0x80 || ($b4>=0xC2 && $b4<0xF5))) {
 193              return 3;
 194          }
 195  
 196          // We're not there, test for a 3 byte byte sequence. See the comment
 197          // above on testing the 5th byte. Also note that b2 is restricted
 198          // so that we keep in the <= U+10FFFF range
 199          $b5 = ($len>4) ? ord($seq[4]) : 0;
 200          if ($b1>=0xF0 && $b1<0xF5 && $b2>=0x80 && $b2<0xC0
 201                  && $b3>=0x80 && $b3<0xC0 && $b4>=0x80 && $b4<0xC0
 202                  && ($b5<0x80 || ($b5>=0xC2 && $b5<0xF5))) {
 203              return 4;
 204          }
 205  
 206          // This is not a valid UTF-8 sequence.
 207          return 0;
 208      }
 209  
 210      /**
 211       * Convert a string with UTF-8 and code page characters to a valid UTF-8
 212       * string.
 213       *
 214       * When converting the input string to UTF-8 all bytes in the 0x80-0xFF
 215       * range are first tested to see if they start a valid UTF-8 byte
 216       * sequence; if not, the byte is assumed to be an 8 bit code page character
 217       * and converted according to the given encoding. Supported encodings are:
 218       *
 219       * * \Scrivo\String::ENC_ISO_8859_1
 220       * * \Scrivo\String::ENC_CP_1251
 221       *
 222       * @param string $str The string with mixed UTF-8 and 8 bit code
 223       *   page characters.
 224       * @param string $encoding The encoding to use when converting 8 bit code
 225       *   page characters to UTF-8.
 226       * @throws \Scrivo\SystemException If the given encoding is not supported.
 227       * @return string A valid UTF-8 string.
 228       */
 229      private function fixCodePageString($str, $encoding) {
 230  
 231          // validate the encoding: only these two have a conversion map
 232          if ($encoding != self::ENC_ISO_8859_1
 233                  && $encoding != self::ENC_CP_1251) {
 234              throw
 235                  new \Scrivo\SystemException("Unsupported encoding: $encoding");
 236          }
 237  
 238          // Split the data on any occurrence of a byte with the high bit set
 239          $parts =
 240              preg_split('/[\x80-\xFF]/', $str, -1, PREG_SPLIT_OFFSET_CAPTURE);
 241  
 242          // See if there's anything to do (a single part means no such bytes)
 243          $c = count($parts);
 244          if ($c<=1) {
 245              return $str;
 246          }
 247  
 248          // Start with the first part
 249          $out = $parts[0][0];
 250          for ($i=1; $i<$c; $i++) {
 251              // Get up to 6 bytes starting at the byte we split on ...
 252              $seq = substr($str, $parts[$i][1]-1, 6);
 253              // ... and check if it is a valid UTF-8 byte sequence, ...
 254              $utf8_seq_width = $this->isUtf8Sequence($seq);
 255              if ($utf8_seq_width) {
 256                  // ... if so add it to output ...
 257                  $res = substr($seq, 0, $utf8_seq_width);
 258                  // ... and skip the empty parts split off at its other bytes.
 259                  $i += ($utf8_seq_width - 1);
 260              } else {
 261                  // ... else treat it as a codepage character
 262                  $res = self::$maps[$encoding][ord($seq[0])];
 263              }
 264              // add the UTF-8 character and next part to the output
 265              $out .= $res.$parts[$i][0];
 266          }
 267  
 268          return $out;
 269      }
 270  
 271      /**
 272       * Convert a string with HTML entities, UTF-8 and code page characters
 273       * to a valid UTF-8 string.
 274       *
 275       * Unless $encoding is "UTF-8", all bytes in the 0x80-0xFF range are first
 276       * tested to see if they start a valid UTF-8 byte sequence; if not, the
 277       * byte is assumed to be an 8 bit code page character and converted
 278       * according to the given encoding. Supported encodings are:
 279       *
 280       * * \Scrivo\String::ENC_ISO_8859_1
 281       * * \Scrivo\String::ENC_CP_1251
 282       *
 283       * You can opt to convert HTML entities in the string to their
 284       * corresponding characters. Possible choices are:
 285       *
 286       * * \Scrivo\String::DECODE_NONE, don't decode HTML entities;
 287       * * \Scrivo\String::DECODE_ALL, decode all HTML entities;
 288       * * \Scrivo\String::DECODE_UNRESERVED, decode all but the HTML entities
 289       *     for <>&' and " (HTML/XML)
 290       *
 291       * @param string $str The source string, a possible mixture of HTML
 292       *   entities, UTF-8 and code page characters.
 293       * @param int $toDecode Which entities to decode (a DECODE_* constant).
 294       * @param string $encoding The encoding to use when converting 8 bit code
 295       *   page characters to UTF-8.
 296       *
 297       * @return string A valid UTF-8 string.
 298       */
 299      private function fixString($str, $toDecode=self::DECODE_NONE,
 300              $encoding="UTF-8") {
 301  
 302          // HTML entities we want to keep (hex forms like &#x3C; are not listed).
 303          $reserved = array(
 304              "&lt;", "&gt;", "&amp;", "&quot;", "&#39;",
 305              "&#60;", "&#62;", "&#38;", "&#34;", "&#039;",
 306              "&#060;","&#062;", "&#038;", "&#034;", "&apos;"
 307          );
 308  
 309          // Placeholder markers, one per entry in $reserved, that stand in for
 310          // the entities to keep so html_entity_decode will leave them alone.
 311          $save = array(
 312              "#*@lt!;", "#*@gt!;", "#*@amp!;", "#*@quot!;", "#*@#039!;",
 313              "#*@lt!;", "#*@gt!;", "#*@amp!;", "#*@quot!;", "#*@#039!;",
 314              "#*@lt!;", "#*@gt!;", "#*@amp!;", "#*@quot!;", "#*@#039!;"
 315          );
 316  
 317          if ($toDecode == self::DECODE_UNRESERVED) {
 318              // 'Save' entities for reserved characters.
 319              $str = str_replace($reserved, $save, $str);
 320          }
 321          if ($encoding != "UTF-8") {
 322              // Fix characters that are not properly UTF-8 encoded
 323              $str = $this->fixCodePageString($str, $encoding);
 324          }
 325          if ($toDecode != self::DECODE_NONE) {
 326              // Change all entities to their corresponding UTF-8 characters.
 327              $str = html_entity_decode($str, ENT_QUOTES, "UTF-8");
 328          }
 329          if ($toDecode == self::DECODE_UNRESERVED) {
 330              // 'Restore' saved entities, each in the form of the first row.
 331              $str = str_replace(array_slice($save, 0, 5),
 332                  array_slice($reserved, 0, 5), $str);
 333          }
 334  
 335          return $str;
 336      }
 337  
 338      /**
 339       * Get a substring from this string without first checking the boundaries.
 340       *
 341       * @param int $start Start character offset for the substring, use a
 342       *   negative number to use an offset from the end of the string.
 343       * @param int $length The length of the substring in characters.
 344       *
 345       * @return \Scrivo\String The requested portion of this string.
 346       */
 347      private function unsafeSubstr($start, $length) {
 348          return new \Scrivo\String(mb_substr($this->str, $start, $length, "UTF-8"));
 349      }
 350  
 351      /**
 352       * Construct a \Scrivo\String.
 353       *
 354       * You can either construct a \Scrivo\String object from a valid UTF-8 string,
 355       * or from a string that you expect not to contain valid UTF-8 data. In the
 356       * latter case use the $toDecode and/or $encoding parameters.
 357       *
 358       * Possible choices for $toDecode are:
 359       *
 360       * * \Scrivo\String::DECODE_NONE, don't decode HTML entities;
 361       * * \Scrivo\String::DECODE_ALL, decode all HTML entities;
 362       * * \Scrivo\String::DECODE_UNRESERVED, decode all but the HTML entities
 363       *      for <>&' and " (HTML/XML)
 364       *
 365       * If you expect that the source string contains 8 bit code page characters
 366       * then you can select the encoding to use to convert them to their
 367       * corresponding UTF-8 characters. Supported encodings are:
 368       *
 369       * * \Scrivo\String::ENC_ISO_8859_1
 370       * * \Scrivo\String::ENC_CP_1251
 371       *
 372       * Note: typical use of the $toDecode and $encoding parameters is when
 373       * you want to 'sanitize' data before you store it into a database. Setting
 374       * these parameters starts CPU intensive procedures so it's best not to use
 375       * them in bulk operations (like that inner loop or slashdotted home page).
 376       * And remember: once all data is safely stored as UTF-8, there will be
 377       * no need to 'sanitize' it before displaying.
 378       *
 379       * @param string $str The source string, a possible mixture of HTML
 380       *   entities, UTF-8 and code page characters. It is cast to a string.
 381       * @param int $toDecode Which entities to decode (a DECODE_* constant).
 382       * @param string $encoding The encoding to use when converting 8 bit code
 383       *   page characters to UTF-8.
 384       */
 385      public function __construct($str="", $toDecode=self::DECODE_NONE,
 386              $encoding="UTF-8") {
 387          $str = (string)$str;
 388          $this->str = $toDecode==self::DECODE_NONE && $encoding=="UTF-8" ? $str
 389              : $this->fixString($str, $toDecode, $encoding);
 390          $this->pos = 0;
 391      }
 392  
 393      /**
 394       * Factory method to construct a \Scrivo\String, or an array of them.
 395       *
 396       * @see \Scrivo\String::__construct()
 397       *
 398       * @param string|array $str The string (or array of strings, handled
 399       *   recursively) to wrap. It is assumed that this will be valid UTF-8;
 400       *   if not, you'll need to set the additional parameters.
 401       * @param int $toDecode Which entities to decode (a DECODE_* constant).
 402       * @param string $encoding The encoding of 8 bit code page characters.
 403       *
 404       * @return \Scrivo\String|\Scrivo\String[] A wrapper object or array of them.
 405       */
 406      public static function create($str="", $toDecode=self::DECODE_NONE,
 407              $encoding="UTF-8") {
 408          if (is_array($str)) {
 409              foreach($str as $k=>$v) {
 410                  $str[$k] = self::create($v, $toDecode, $encoding);
 411              }
 412              return $str;
 413          }
 414          return new \Scrivo\String($str, $toDecode, $encoding);
 415      }
 416  
 417      /**
 418       * Get the collator for sorting strings. If none was set yet, one is
 419       * created for the default locale (\Locale::getDefault()) and kept.
 420       * @return \Collator The currently set collator for the \Scrivo\String
 421       *    class.
 422       */
 423      public static function getCollator() {
 424          if (!self::$coll) {
 425              self::$coll = new \Collator(\Locale::getDefault());
 426          }
 427          return self::$coll;
 428      }
 429  
 430      /**
 431       * Set the collator for sorting strings; it is shared by all instances.
 432       *
 433       * @param \Collator $coll The collator to use.
 434       */
 435      public static function setCollator(\Collator $coll) {
 436          self::$coll = $coll;
 437      }
 438  
 439      /**
 440       * Readable properties ('length' and 'collator') implemented using the
 441       * PHP magic method __get().
 442       *
 443       * @param string $name The name of the property to get.
 444       * @throws \Scrivo\SystemException If there is no property with that name.
 445       * @return mixed The value of the requested property.
 446       */
 447      public function __get($name) {
 448          switch($name) {
 449              case "length": return $this->getLength();
 450              case "collator": return self::getCollator();
 451          }
 452          throw new \Scrivo\SystemException("No such property '$name'.");
 453      }
 454  
 455      /**
 456       * Return the primitive (native PHP) UTF-8 string held by this instance.
 457       *
 458       * @return string The primitive UTF-8 string for this instance.
 459       */
 460      public function __toString() {
 461          return $this->str;
 462      }
 463  
 464      /**
 465       * Test if this string equals another \Scrivo\String object.
 466       *
 467       * When you want test \Scrivo\String object for equality, use this method
 468       * and never the equality operator (==) because then you'll compare
 469       * objects and therefore all data members of \Scrivo\String and this can
 470       * give you other results (or cast the \Scrivo\String strings to PHP strings
 471       * before comparing).
 472       *
 473       * @param \Scrivo\String $str The string to compare this string to.
 474       *
 475       * @return boolean True if the given string equals this string.
 476       */
 477      public function equals(\Scrivo\String $str) {
 478          return (string)$this->str == (string)$str;
 479      }
 480  
 481      /**
 482       * Get the length of the string; it is computed on first use and cached.
 483       *
 484       * @return int The length of the string in characters (not bytes).
 485       */
 486      public function getLength() {
 487          if ($this->len == -1) {
 488              $this->len = mb_strlen($this->str, "UTF-8");
 489          }
 490          return $this->len;
 491      }
 492  
 493      /**
 494       * Return the character count of the string.
 495       *
 496       * This is an alias for getLength() and part of the implementation of
 497       * \Countable.
 498       *
 499       * @return int The length of the string in characters (not bytes).
 500       */
 501      public function count() {
 502          return $this->getLength();
 503      }
 504  
 505      /**
 506       * Return the current UTF-8 character when iterating.
 507       *
 508       * Note that this method is part of the implementation of Iterator and
 509       * should not be called from another context.
 510       *
 511       * @return \Scrivo\String The current UTF-8 character in this string when
 512       *   iterating, wrapped in a new \Scrivo\String.
 513       */
 514      public function current() {
 515          // note: iterator will call valid() before current().
 516          return $this->unsafeSubstr($this->pos, 1);
 517      }
 518  
 519      /**
 520       * Return the index of the current UTF-8 character when iterating.
 521       *
 522       * Note that this method is part of the implementation of Iterator and
 523       * should not be called from another context.
 524       *
 525       * @return int The index of the current UTF-8 character in this string
 526       *   when iterating.
 527       */
 528      public function key() {
 529          return $this->pos;
 530      }
 531  
 532      /**
 533       * Move forward in this string to the next UTF-8 character when iterating.
 534       *
 535       * Note that this method is part of the implementation of Iterator and
 536       * should not be called from another context.
 537       */
 538      public function next() {
 539          $this->pos++;
 540      }
 541  
 542      /**
 543       * Reset the current character index so iterating will (re)start at the
 544       * beginning of this string.
 545       *
 546       * Note that this method is part of the implementation of Iterator and
 547       * should not be called from another context.
 548       */
 549      public function rewind() {
 550          $this->pos = 0;
 551      }
 552  
 553      /**
 554       * Check if the current character index for iterating is valid.
 555       *
 556       * Note that this method is part of the implementation of Iterator and
 557       * should not be called from another context.
 558       *
 559       * @return boolean True if the current character index is valid else false.
 560       */
 561      public function valid() {
 562          return ($this->pos >= 0 && $this->pos < $this->getLength());
 563      }
 564  
 565      /**
 566       * Illegal method: set a character at a specified index location.
 567       *
 568       * Note that this method is part of the implementation of ArrayAccess.
 569       * \Scrivo\Strings are immutable and therefore it is prohibited to set
 570       * elements (characters) in a string, so this method implementation is
 571       * not relevant and always throws an exception when called.
 572       *
 573       * @param int $offset Ignored.
 574       * @param string $value Ignored.
 575       *
 576       * @throws \Scrivo\SystemException If this method is called.
 577       */
 578      public function offsetSet($offset, $value) {
 579          throw new \Scrivo\SystemException(
 580              "offsetSet can't be called on \Scrivo\String objects");
 581      }
 582  
 583      /**
 584       * Get a UTF-8 character from a string using array brackets.
 585       *
 586       * Note that this method is part of the implementation of ArrayAccess and
 587       * should not be called from another context.
 588       *
 589       * @param int $offset A character offset in the string.
 590       * @return \Scrivo\String The character at the given offset.
 591       * @throws \Scrivo\SystemException If the requested offset was out of range.
 592       */
 593      public function offsetGet($offset) {
 594          if (!$this->offsetExists($offset)) {
 595              throw new \Scrivo\SystemException(
 596                  "String index [$offset] out of bounds");
 597          }
 598          return $this->unsafeSubstr($offset, 1);
 599      }
 600  
 601      /**
 602       * Check if the specified index location in this string is valid.
 603       *
 604       * Note that this method is part of the implementation of ArrayAccess and
 605       * should not be called from another context.
 606       *
 607       * @param int $offset A character offset in the string.
 608       *
 609       * @return boolean True if the offset is at least 0 and below the length.
 610       */
 611      public function offsetExists($offset) {
 612          return ($offset >= 0 && $offset < $this->getLength());
 613      }
 614  
 615      /**
 616       * Illegal method: unset a character at a specified index location.
 617       *
 618       * Note that this method is part of the implementation of ArrayAccess.
 619       * \Scrivo\Strings are immutable and therefore it is prohibited to unset
 620       * elements (characters) in a string, so this method implementation is
 621       * not relevant and always throws an exception when called.
 622       *
 623       * @param int $offset Ignored.
 624       *
 625       * @throws \Scrivo\SystemException If this method is called.
 626       */
 627      public function offsetUnset($offset) {
 628          throw new \Scrivo\SystemException(
 629              "offsetUnset can't be called on \Scrivo\String objects");
 630      }
 631  
 632      /**
 633       * Get a substring from a string using an offset and a length.
 634       *
 635       * Just like PHP's native substr function this method returns a substring
 636       * from this string using an offset and a length. But note that this
 637       * method will throw an exception if the offset is invalid.
 638       *
 639       * @param int $start Start offset for the substring, use a negative number
 640       *   to use an offset from the end of the string.
 641       * @param int $length The length of the substring.
 642       *
 643       * @return \Scrivo\String The portion of this string specified by the $start
 644       *   and $length parameter.
 645       *
 646       * @throws \Scrivo\SystemException if the requested offset was out of range.
 647       */
 648      public function substr($start, $length=0xFFFF) {
 649          $tmp = $start < 1 ? -$start : $start;
 650          if (!$this->offsetExists($tmp)) {
 651              throw new \Scrivo\SystemException(
 652                  "String index [$start] out of bounds");
 653          }
 654          return $this->unsafeSubstr($start, $length);
 655      }
 656  
 657      /**
 658       * Get a substring from a string using a start and end index.
 659       *
 660       * This method is inspired by it's JAVA counterpart and returns a
 661       * substring of this string using an start and end index.
 662       *
 663       * @param int $start Start offset for the substring.
 664       * @param int $end The end offset for the substring.
 665       *
 666       * @return \Scrivo\String The portion of this string specified by the $start
 667       *   and $end parameter.
 668       *
 669       * @throws \Scrivo\SystemException if the requested offset was out of range.
 670       */
 671      public function substring($start, $end) {
 672          if (!$this->offsetExists($start) || !$this->offsetExists($end)
 673                  || $start > $end) {
 674              throw new \Scrivo\SystemException(
 675                  "String index [$start, $end] out of bounds");
 676          }
 677          return $this->unsafeSubstr($start, $end-$start);
 678      }
 679  
 680      /**
 681       * Get a trimmed copy of this string.
 682       *
 683       * Returns a copy of the string, with leading and trailing whitespace
 684       * removed. Whitespace characters are: ' ', \t, \r, \n, the character
 685       * for a non breaking space.
 686       *
 687       * @return \Scrivo\String A copy of this string with leading and trailing
 688       *   white space removed.
 689       */
 690      public function trim() {
 691          return new \Scrivo\String(
 692              preg_replace("/(^[\s ]+)|([\s ]+$)/us", "", $this->str));
 693      }
 694  
 695      /**
 696       * Check if the string contains the given substring.
 697       *
 698       * This is the test you normally use strpos(...) !== false for.
 699       *
 700       * @param \Scrivo\String $str The string to search for.
 701       * @param int $offset An offset from where to start the search.
 702       * @param boolean $ignoreCase Set to perform an case insensitive lookup.
 703       *
 704       * @return boolean True if the given string is contained by this string.
 705       *
 706       * @throws \Scrivo\SystemException If the $offset is out of range.
 707       */
 708      public function contains(\Scrivo\String $str, $offset=0, $ignoreCase=false) {
 709          if ($offset && !$this->offsetExists($offset)) {
 710              throw new \Scrivo\SystemException(
 711                  "String index [$offset] out of bounds");
 712          }
 713          if ($ignoreCase) {
 714              return mb_stripos(
 715                  $this->str, (string)$str, $offset, "UTF-8") !== false;
 716          } else {
 717              // binary is ok to do
 718              return strpos($this->str, (string)$str, $offset) !== false;
 719          }
 720      }
 721  
 722      /**
 723       * Returns the index of the given substring in this string.
 724       *
 725       * Just like the PHP's native strpos and stripos functions this method
 726       * returns the index of a substring in this string. But there are two
 727       * important differences: this method returns -1 if the substring was
 728       * not found, and this method will raise an exception if the given
 729       * offset was out of range.
 730       *
 731       * @param \Scrivo\String $str The string to search for.
 732       * @param int $offset An offset from where to start the search.
 733       * @param boolean $ignoreCase Set to perform an case insensitive lookup.
 734       *
 735       * @return int The index of the first occurance of the substring after
 736       *   $offset and -1 if the substring was not found.
 737       *
 738       * @throws \Scrivo\SystemException If the $offset is out of range.
 739       */
 740      public function indexOf(\Scrivo\String $str, $offset=0, $ignoreCase=false) {
 741          if ($offset && !$this->offsetExists($offset)) {
 742              throw new \Scrivo\SystemException(
 743                  "String index [$offset] out of bounds");
 744          }
 745          $res = -1;
 746          if ($ignoreCase) {
 747              $res = mb_stripos($this->str, $str, $offset, "UTF-8");
 748          } else {
 749              $res = mb_strpos($this->str, $str, $offset, "UTF-8");
 750          }
 751          return $res !== false ? $res : -1;
 752      }
 753  
 754      /**
 755       * Returns the index of the last occurance of the given substring in this
 756       * string.
 757       *
 758       * Just like the PHP's native strrpos and strripos functions this method
 759       * returns the substring of this string that start with the first occurance
 760       * of the given a substring in this string.  But note that this
 761       * method will throw an exception if the offset is invalid.
 762       * Also an negative offset to indicate an offset measured from the end
 763       * of the string is allowed. But there are two important differences:
 764       * this method returns -1 if the substring was not found, and this method
 765       * will raise an exception if the given offset was out of range.
 766       *
 767       * @param \Scrivo\String $str The string to search for.
 768       * @param int $offset An offset from where to start the search. A positive
 769       *   value indicates an offset measured from the start of the string, a
 770       *   negative value from the end of the string.
 771       * @param boolean $ignoreCase Perform an case insensitive lookup.
 772       *
 773       * @return int The index of the last occurance of the substring after
 774       *   $offset.
 775       * @throws \Scrivo\SystemException If the $offset is out of range.
 776       */
 777      public function lastIndexOf(\Scrivo\String $str, $offset=0, $ignoreCase=false) {
 778          if ($offset) {
 779              $tmp = $offset < 1 ? -$offset : $offset;
 780              if (!$this->offsetExists($tmp)) {
 781                  throw new \Scrivo\SystemException(
 782                      "String index [$offset] out of bounds");
 783              }
 784          }
 785          $res = -1;
 786          if ($ignoreCase) {
 787              $res = mb_strripos($this->str, $str, $offset, "UTF-8");
 788          } else {
 789              $res = mb_strrpos($this->str, $str, $offset, "UTF-8");
 790          }
 791          return $res !== false ? $res : -1;
 792      }
 793  
 794      /**
 795       * Returns the first occurance of a given substring in this string.
 796       *
 797       * Just like the PHP's native strstr and stristr functions this method
 798       * returns the substring of this string that start with the first occurance
 799       * of the given a substring in this string. Note that this method throws
 800       * an exception if an empty string was given as search string and not
 801       * a warning as strstr does.
 802       *
 803       * @param \Scrivo\String $str The string to search for.
 804       * @param int $part Flag to indicate to return the part of the string
 805       *   before the first occurance of the given substring i.o. the part
 806       *   after the substring.
 807       * @param boolean $ignoreCase Perform an case insensitive lookup.
 808       *
 809       * @return \Scrivo\String The substring plus the part of the string after the
 810       *   the first occurance of the substring, or the part of the string before
 811       *   the first occurance of the substring (excluding the substring) or NULL
 812       *   if not found.
 813       *
 814       * @throws \Scrivo\SystemException If an empty search string was given.
 815       */
 816      public function firstOccurranceOf(\Scrivo\String $str, $part=false,
 817              $ignoreCase=false) {
 818          if (!$str->getLength()) {
 819              throw new \Scrivo\SystemException(
 820                  "firstOccurranceOf requires a search string");
 821          }
 822          $res = NULL;
 823          if ($ignoreCase) {
 824              $res = mb_stristr($this->str, $str, $part, "UTF-8");
 825          } else {
 826              $res = mb_strstr($this->str, $str, $part, "UTF-8");
 827          }
 828          return $res !== false ? new \Scrivo\String($res) : NULL;
 829      }
 830  
 831      /**
 832       * Returns the last occurance of a given character in this string.
 833       *
 834       * Just like the PHP's native strrchr and strrichr functions this method
 835       * returns the substring of this string that start with the first occurance
 836       * of the given a substring in this string. Note that this method throws
 837       * an exception if an empty string was given as search string and not
 838       * a warning as strstr does.
 839       *
 840       * @param \Scrivo\String $str The character to search for.
 841       * @param int $part Flag to indicate to return part of the string before
 842       *   the last occurance of the given character i.o. the part after the
 843       *   character.
 844       * @param boolean $ignoreCase Perform an case insensitive lookup.
 845       *
 846       * @return \Scrivo\String The substring plus the part of the string after the
 847       *   the last occurance of the character, or the part of the string before
 848       *   the last occurance of the character (excluding the character) or NULL
 849       *   if not found.
 850       *
 851       * @throws \Scrivo\SystemException If a search string of not exactly one
 852       *   character in length was given.
 853       */
 854      public function lastOccurranceOf(\Scrivo\String $str, $part=false,
 855              $ignoreCase=false) {
 856          if ($str->getLength() != 1) {
 857              throw new \Scrivo\SystemException(
 858                  "lastOccurranceOf accepts single charaters only");
 859          }
 860          $res = NULL;
 861          if ($ignoreCase) {
 862              $res = mb_strrichr($this->str, $str, $part, "UTF-8");
 863          } else {
 864              $res = mb_strrchr($this->str, $str, $part, "UTF-8");
 865          }
 866          return $res !== false ? new \Scrivo\String($res) : NULL;
 867      }
 868  
 869      /**
 870       * Replace a substring or set of substrings in this string.
 871       *
 872       * You can use this method in favour of PHP's native str_replace and strtr
 873       * functions. This method will do proper type checking for you.
 874       *
 875       * @param \Scrivo\String|\Scrivo\String[] $from A (set of) string(s) to replace
 876       *   in this string.
 877       * @param \Scrivo\String|\Scrivo\String[] $to A (set of) replacement string(s) to
 878       *   replace the found string(s).
 879       *
 880       * @return \Scrivo\String A string with the replaced values.
 881       *
 882       * @throws \Scrivo\SystemException If the input data is not of type
 883       *     \Scrivo\String or \Scrivo\String[], of if the $to parameter is an array
 884       *     and $from isn't or hasn't the same number of elements.
 885       */
 886      public function replace($from, $to) {
 887          if ($from instanceof \Scrivo\String && $to instanceof \Scrivo\String) {
 888              return new \Scrivo\String(str_replace($from, $to, $this->str));
 889          } else if (is_array($from) && $to instanceof \Scrivo\String) {
 890              foreach ($from as $k=>$v) {
 891                  if (!($v instanceof \Scrivo\String)) {
 892                      throw new \Scrivo\SystemException("From element is"
 893                          . " not an \Scrivo\String as array position [$k]");
 894                  }
 895              }
 896              return new \Scrivo\String(str_replace($from, $to, $this->str));
 897          } else if (is_array($from) && is_array($to)) {
 898              if (count($from) != count($to)) {
 899                  throw new \Scrivo\SystemException(
 900                      "Input arrays are not the same size");
 901              }
 902              foreach ($from as $k=>$v) {
 903                  if (!($v instanceof \Scrivo\String)
 904                          || !($to[$k] instanceof \Scrivo\String)) {
 905                      throw new \Scrivo\SystemException("To or from element is"
 906                          . " not an \Scrivo\String as array position [$k]");
 907                  }
 908              }
 909              return new \Scrivo\String(str_replace($from, $to, $this->str));
 910          }
 911          throw new \Scrivo\SystemException("Invalid argument types");
 912      }
 913  
 914      /**
 915       * Split this string using a delimiter.
 916       *
 917       * Just like PHP's native explode this method splits a string on
 918       * boundaries formed by the string delimiter. Note that the behavoir
 919       * of the limit parameter is a little bit different and that this method
 920       * will throw an exception if an empty string is passed as a delimiter.
 921       *
 922       * @param \Scrivo\String $delimiter The boundary string.
 923       * @param int $limit If limit is set and positive, the returned array
 924       *     will contain a maximum of limit elements with the last element
 925       *     containing the rest of string. If the limit parameter is negative,
 926       *     all components except the last -limit are returned. If the limit is
 927       *     not set or 0 no limit wil be used.
 928       *
 929       * @return \Scrivo\String[] An array of strings created by splitting the
 930       *     string parameter on boundaries formed by the delimiter. If the
 931       *     delimiter was not found and array containing a copy of this string
 932       *     will be returned except if limit was negative, in that case an
 933       *     empty array will be returned.
 934       *
 935       * @throws \Scrivo\SystemException If an empty search string was given.
 936       */
 937      public function split(\Scrivo\String $delimiter, $limit=0) {
 938          if ($delimiter == "") {
 939              throw new \Scrivo\SystemException(
 940                      "split cannot use an empty \"\" delimiter.");
 941          }
 942          $r = $limit ? explode($delimiter, $this->str, $limit)
 943              : explode($delimiter, $this->str);
 944          foreach ($r as $k=>$v) {
 945              $r[$k] = new \Scrivo\String($v);
 946          }
 947          return $r;
 948      }
 949  
 950      /**
 951       * Get a copy of this string with all of its characters converted to lower
 952       * case.
 953       *
 954       * @return \Scrivo\String A string containing only lower case characters.
 955       */
 956      public function toLowerCase() {
 957          return new \Scrivo\String(mb_strtolower($this->str, "UTF-8"));
 958      }
 959  
 960      /**
 961       * Get a copy of this string with all of its characters converted to upper
 962       * case.
 963       *
 964       * @return \Scrivo\String A string containing only upper case characters.
 965       */
 966      public function toUpperCase() {
 967          return new \Scrivo\String(mb_strtoupper($this->str, "UTF-8"));
 968      }
 969  
 970      /**
 971       * Compare this string to another \Scrivo\String object.
 972       *
 973       * Note that this method requires the \Scrivo\String collator to be set,
 974       * else the method falls back to the default locale for creating a
 975       * collator and generates a warning.
 976       *
 977       * @param \Scrivo\String $str The string to compare this string to.
 978       *
 979       * @return int Less than 0 if this string is less than the given
 980       *   string $str; more than 0 if this string is greater than $str, and
 981       *   0 if they are equal.
 982       */
 983      public function compareTo(\Scrivo\String $str) {
 984          return self::getCollator()->compare($this->str, $str);
 985      }
 986  
 987      /**
 988       * Check if this string exists an array of \Scrivo\String-s.
 989       *
 990       * @param \Scrivo\String $arr The array to search.
 991       *
 992       * @return mixed If found the key of the first occurance of the string
 993       *    in the array, else null.
 994       */
 995      public function inArray($arr) {
 996          foreach ($arr as $k=>$v) {
 997              if ($v->equals($this)) {
 998                  return $k;
 999              }
1000          }
1001          return null;
1002      }
1003  }
1004  
1005  ?>
Documentation generated by phpDocumentor 2.0.0a12 and ScrivoDocumentor on August 29, 2013
Code View