PHP Classes

File: demo_UTF8.php

Recommend this page to a friend!
  Classes of Ray Paseur   PHP UTF-8 Validation   demo_UTF8.php   Download  
File: demo_UTF8.php
Role: Example script
Content type: text/plain
Description: Demonstration Script
Class: PHP UTF-8 Validation
Validate and repair strings in UTF-8 encoding
Author: By
Last change:
Date: 5 years ago
Size: 3,398 bytes
 

Contents

Class file image Download
<?php // classes/demo_UTF8.php
/**
 * This script uses class_UTF8 to determine if a string is UTF-8 compatible.
 *
 * The constructor receives a string and returns an object containing the
 * string and a validity indicator. If the string fails UTF-8 validation,
 * the offset location of the failures will be provided in an array in the
 * "error" property.
 *
 * The class can also attempt to repair damaged encodings, but the outcome
 * of repairs is less certain. PHP converts extended ASCII (ISO-8859-1) into
 * UTF-8 by putting a lead byte (hex C2 or C3) in front of the extended ASCII
 * characters, thus every byte above hex 7F becomes a two-byte sequence.
 */
// Surface every notice and warning while the demonstration runs
error_reporting(E_ALL);

// Bring in the UTF8 class exercised by the loops further down
require_once 'class_UTF8.php';

// Declare the page encoding, then open a preformatted section for the dumps
echo '<meta charset="utf-8" />', '<pre>';


// UTF-8 test strings - a mix of good and bad samples
$arr = [
    'ABCDEF',
    '14°F is cold!',
    'Größe',
    '©',
    chr(0xC3) . chr(0x86),             // AE Ligature in UTF-8
    chr(0xE2) . chr(0x82) . chr(0xAC), // Euro in UTF-8

    // Bad UTF-8: each contains a lone byte in the range 127 < byte < 256
    chr(0xC6) . ' AE Ligature',
    'Accented "a" ' . chr(0xE0) . ' in this string',
    'Several ' . chr(0x80) . ' Euro ' . chr(0x80) . ' symbols ' . chr(0x80) . ' in ' . chr(0x80) . ' text',

    // A UTF-8 nemesis from MSFT Notepad: the byte order mark
    chr(0xEF) . chr(0xBB) . chr(0xBF) . 'Thanks for the BOM, Notepad',

    // A bogus character that should not be translated
    'Bogus 0x81: ' . chr(0x81),

    // Anthony Ferrara test data
    chr(0xC0) . chr(0x80),                                                 // Overlong encoding of code point 0
    chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80),             // Overlong encoding, 5-byte form
    chr(0xFC) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80), // Overlong encoding, 6-byte form
    chr(0xD0) . chr(0x01),                                                 // Lead byte without a continuation byte
    chr(0x01) . chr(0x01) . chr(0x01),                                     // Actually valid ;-)
];


// First pass: hand every sample to the class without asking for repair
echo '<h3>Data Not Repaired</h3>';
foreach ($arr as $str) {
    // Hex view of the raw input
    hexdump($str);
    echo PHP_EOL;

    // Hex view of the string held by the object, then the object itself
    $obj = new UTF8($str);
    hexdump($obj->str);
    print_r($obj);
    echo PHP_EOL;
}


// Malformed UTF-8 samples that the class will be asked to repair
$bad = [
    'AE Ligature at end: ' . chr(0xC6),
    'Pound at end: ' . chr(0xA3),
    'The ' . chr(0x80) . ' Euro symbol',
    'Several ' . chr(0x80) . ' Euro ' . chr(0x80) . ' symbols ' . chr(0x80) . ' in ' . chr(0x80) . ' text',

    // A bogus character that cannot be translated
    'Bogus 0x81: ' . chr(0x81),
];

// Second pass: same display, but TRUE is passed as the second constructor
// argument so the class attempts a repair
echo '<h3>Data Repair Attempted</h3>';
foreach ($bad as $str) {
    // Hex view of the damaged input
    hexdump($str);
    echo PHP_EOL;

    // Hex view of the string held by the object, then the object itself
    $obj = new UTF8($str, true);
    hexdump($obj->str);
    print_r($obj);
    echo PHP_EOL;
}



/**
 * Unrelated utility function to show us the hex byte values of a string.
 *
 * Echoes four rows, each preceded by $br, followed by one closing $br:
 * a column ruler, the string itself, the high hex digit of every byte and
 * the low hex digit of every byte. Reading a column top to bottom therefore
 * gives the character and its two-digit hex value.
 *
 * @param string $str The bytes to display.
 * @param string $br  Separator written before each row and once at the end.
 * @return false|null FALSE when there is nothing to display, otherwise NULL.
 */
function hexdump($str, $br=PHP_EOL)
{
    // Skip only values that really hold no bytes. empty() is not used here
    // because it also rejects the one-character string '0', which is data.
    if ($str === NULL || is_array($str)) return FALSE;
    $str = (string) $str;
    if ($str === '') return FALSE;

    // Two lowercase hex digits per byte, high nibble first
    $hex = bin2hex($str);

    // Allocate the digits into hi and lo nibble rows
    $hi = '';
    $lo = '';
    for ($i = 0, $len = strlen($hex); $i < $len; $i += 2) {
        $hi .= $hex[$i];
        $lo .= $hex[$i + 1];
    }

    // Cut the ruler to the byte length of the string; the ruler literal is
    // finite, so a longer string simply gets a ruler shorter than itself
    $num = substr('1...5...10...15...20...25...30...35...40...45...50...55...60...65...70...75...80...85...90...95..100..105..110..115..120..125..130', 0, strlen($str));

    // Show the scale, the string and the hex
    echo $br . $num;
    echo $br . $str;
    echo $br . $hi;
    echo $br . $lo;
    echo $br;
}