EncodingUtil.php
1 <?php
2 /**
3  * wCMF - wemove Content Management Framework
4  * Copyright (C) 2005-2015 wemove digital solutions GmbH
5  *
6  * Licensed under the terms of the MIT License.
7  *
8  * See the LICENSE file distributed with this work for
9  * additional information.
10  */
11 namespace wcmf\lib\io;
12 
13 /**
14  * EncodingUtil provides helper functions for working with different encodings
15  * mainly UTF-8.
16  *
17  * @author ingo herwig <ingo@wemove.com>
18  */
class EncodingUtil {

  /**
   * This structure encodes the difference between ISO-8859-1 and Windows-1252,
   * as a map from the UTF-8 encoding of some ISO-8859-1 control characters
   * (U+0080..U+009F) to the UTF-8 encoding of the non-control characters that
   * Windows-1252 places at the equivalent code points.
   * Code points without an entry (0x81, 0x8D, 0x8F, 0x90, 0x9D) are left
   * untouched by the conversion functions below.
   * code from: http://de3.php.net/manual/de/function.utf8-encode.php#45226
   */
  private static $CP1252Map = array(
    "\xc2\x80" => "\xe2\x82\xac", /* EURO SIGN */
    "\xc2\x82" => "\xe2\x80\x9a", /* SINGLE LOW-9 QUOTATION MARK */
    "\xc2\x83" => "\xc6\x92",     /* LATIN SMALL LETTER F WITH HOOK */
    "\xc2\x84" => "\xe2\x80\x9e", /* DOUBLE LOW-9 QUOTATION MARK */
    "\xc2\x85" => "\xe2\x80\xa6", /* HORIZONTAL ELLIPSIS */
    "\xc2\x86" => "\xe2\x80\xa0", /* DAGGER */
    "\xc2\x87" => "\xe2\x80\xa1", /* DOUBLE DAGGER */
    "\xc2\x88" => "\xcb\x86",     /* MODIFIER LETTER CIRCUMFLEX ACCENT */
    "\xc2\x89" => "\xe2\x80\xb0", /* PER MILLE SIGN */
    "\xc2\x8a" => "\xc5\xa0",     /* LATIN CAPITAL LETTER S WITH CARON */
    "\xc2\x8b" => "\xe2\x80\xb9", /* SINGLE LEFT-POINTING ANGLE QUOTATION */
    "\xc2\x8c" => "\xc5\x92",     /* LATIN CAPITAL LIGATURE OE */
    "\xc2\x8e" => "\xc5\xbd",     /* LATIN CAPITAL LETTER Z WITH CARON */
    "\xc2\x91" => "\xe2\x80\x98", /* LEFT SINGLE QUOTATION MARK */
    "\xc2\x92" => "\xe2\x80\x99", /* RIGHT SINGLE QUOTATION MARK */
    "\xc2\x93" => "\xe2\x80\x9c", /* LEFT DOUBLE QUOTATION MARK */
    "\xc2\x94" => "\xe2\x80\x9d", /* RIGHT DOUBLE QUOTATION MARK */
    "\xc2\x95" => "\xe2\x80\xa2", /* BULLET */
    "\xc2\x96" => "\xe2\x80\x93", /* EN DASH */
    "\xc2\x97" => "\xe2\x80\x94", /* EM DASH */

    "\xc2\x98" => "\xcb\x9c",     /* SMALL TILDE */
    "\xc2\x99" => "\xe2\x84\xa2", /* TRADE MARK SIGN */
    "\xc2\x9a" => "\xc5\xa1",     /* LATIN SMALL LETTER S WITH CARON */
    "\xc2\x9b" => "\xe2\x80\xba", /* SINGLE RIGHT-POINTING ANGLE QUOTATION */
    "\xc2\x9c" => "\xc5\x93",     /* LATIN SMALL LIGATURE OE */
    "\xc2\x9e" => "\xc5\xbe",     /* LATIN SMALL LETTER Z WITH CARON */
    "\xc2\x9f" => "\xc5\xb8"      /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
  );

  /**
   * Returns true if the given string is valid UTF-8 and false otherwise.
   * Non-string values are never considered UTF-8 and yield false.
   * @param $string The string to be tested
   * @return Boolean
   */
  public static function isUtf8($string) {
    // mb_check_encoding validates the byte sequence directly; the is_string
    // guard keeps non-string scalars from being coerced and reported as valid
    return is_string($string) && mb_check_encoding($string, 'UTF-8');
  }

  /**
   * Decodes mixed CP1252 UTF-8 strings to ISO.
   * Characters listed in the CP1252 map are first mapped back to the
   * corresponding code points U+0080..U+009F, so that they end up as the
   * single bytes 0x80..0x9F in the result. Characters that have no single byte
   * representation are replaced by the mbstring substitute character
   * ('?' unless configured otherwise).
   * @param $string The string to be decoded
   * @return The decoded string
   * code from: http://www.php.net/manual/en/function.utf8-decode.php#47146
   */
  public static function convertCp1252Utf8ToIso($string) {
    $utf8 = strtr((string)$string, array_flip(self::$CP1252Map));
    // utf8_decode() is deprecated as of PHP 8.2
    return mb_convert_encoding($utf8, 'ISO-8859-1', 'UTF-8');
  }

  /**
   * Encodes ISO strings to mixed CP1252 UTF-8.
   * Each input byte is converted to UTF-8 as an ISO-8859-1 character, then the
   * resulting control characters U+0080..U+009F are replaced by the characters
   * that Windows-1252 defines for the same byte values.
   * @param $string The string to be encoded
   * @return The encoded string
   * code from: http://www.php.net/manual/en/function.utf8-decode.php#47146
   */
  public static function convertIsoToCp1252Utf8($string) {
    // utf8_encode() is deprecated as of PHP 8.2
    $utf8 = mb_convert_encoding((string)$string, 'UTF-8', 'ISO-8859-1');
    return strtr($utf8, self::$CP1252Map);
  }

  /**
   * Encodes an ISO-8859-1 mixed variable to UTF-8.
   * Arrays are processed recursively. Strings that are not valid UTF-8 are
   * converted using convertIsoToCp1252Utf8(), strings that are already valid
   * UTF-8 are returned as they are. All other values (null, integers, floats,
   * booleans, objects, ...) are returned unchanged.
   * @param $input A string or an array (associative or simple, may be nested)
   * @param $encodeKeys Boolean whether string keys of arrays should be encoded
   *   in the same way as the values (optional, default: _false_)
   * @return utf-8 encoded input
   * code from: http://de3.php.net/utf8_encode
   */
  public static function utf8EncodeMix($input, $encodeKeys=false) {
    if (is_array($input)) {
      $result = array();
      foreach ($input as $k => $v) {
        // integer keys carry no encoding and are kept as they are
        $key = ($encodeKeys && is_string($k)) ? self::encodeIfNotUtf8($k) : $k;
        $result[$key] = self::utf8EncodeMix($v, $encodeKeys);
      }
      return $result;
    }
    // only strings are converted, so null stays null instead of becoming ''
    return is_string($input) ? self::encodeIfNotUtf8($input) : $input;
  }

  /**
   * Converts the given string with convertIsoToCp1252Utf8() unless it is
   * valid UTF-8 already (prevents double encoding).
   * @param $string The string to be encoded
   * @return The encoded string
   */
  private static function encodeIfNotUtf8($string) {
    return self::isUtf8($string) ? $string : self::convertIsoToCp1252Utf8($string);
  }
}
116 ?>
static convertIsoToCp1252Utf8($string)
Encodes ISO strings to mixed CP1252 UTF-8.
Input/Output related interfaces and classes.
Definition: namespaces.php:21
EncodingUtil provides helper functions for working with different encodings mainly UTF-8...
static isUtf8($string)
Returns true if the given string is valid UTF-8 and false otherwise.
static utf8EncodeMix($input, $encodeKeys=false)
Encodes an ISO-8859-1 mixed variable to UTF-8 (PHP 4, PHP 5 compat)
static convertCp1252Utf8ToIso($string)
Decodes mixed CP1252 UTF-8 strings to ISO.