File: demo_UTF8.php

Recommend this page to a friend!

demo_UTF8.php

File:	`demo_UTF8.php`
Role:	Example script
Content type:	`text/plain`
Description:	Demonstration Script
Class:	PHP UTF-8 Validation Validate and repair strings in UTF-8 encoding
Author:	By Ray Paseur
Last change:
Date:	6 years ago
Size:	`3,398 bytes`

Download


<?php // classes/demo_UTF8.php

/**

 * This script uses class_UTF8 to determine if a string is UTF-8 compatible.

 *

 * The constructor receives a string and returns an object containing the

 * string and a validity indicator.  If the string fails UTF-8 validation,

 * the offset location of the failures will be provided in an array in the

 * "error" property.

 *

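 * The class can also attempt to repair damaged encodings, but the outcome

 * of repairs is less certain.  PHP's conversion of extended ASCII

 * (ISO-8859-1) into UTF-8 turns each byte 0x80-0xBF into 0xC2 followed by

 * that byte, and each byte 0xC0-0xFF into 0xC3 followed by that byte minus

 * 0x40, thus a byte that was never ISO-8859-1 text to begin with is

 * converted into the wrong character.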

 *

 */

// Report every notice, warning and error while the demonstration runs.
error_reporting(E_ALL);

// Load the UTF8 class that this script demonstrates.
require_once 'class_UTF8.php';

// Declare the page encoding, then open a preformatted section so the
// output that follows is displayed exactly as it is written.
echo '<meta charset="utf-8" />', '<pre>';





// Some UTF-8 test data - both good and bad.
//
// Every non-ASCII byte is written with chr() so that the samples do not
// depend on the encoding in which this source file is saved or transmitted.
$arr =
[ 'ABCDEF'

// NOTE(review): the next two literals reached this copy of the file with
// U+FFFD replacement characters where single bytes used to be.  They are
// presumably the ISO-8859-1 degree sign (0xB0) and u-umlaut + sharp-s
// (0xFC 0xDF) - confirm against the author's original file.
, '14' . chr(0xB0) . 'F is cold!'
, 'Gr' . chr(0xFC) . chr(0xDF) . 'e'

// NOTE(review): this entry arrived as a lone U+FFFD replacement character.
// The byte it replaced cannot be recovered from here, so the three bytes of
// U+FFFD are kept unchanged - replace with the original byte if known.
, chr(0xEF) . chr(0xBF) . chr(0xBD)

, chr(0xC3) . chr(0x86)               // AE Ligature in UTF-8
, chr(0xE2) . chr(0x82) . chr(0xAC)   // Euro in UTF-8

// These are examples of bad UTF-8 because they contain a single byte in the
// range 127 < byte < 256 that is not part of a complete multi-byte sequence
, chr(0xC6) . ' AE Ligature'
, 'Accented "a" ' . chr(0xE0) . ' in this string'
, 'Several ' . chr(0x80) . ' Euro ' . chr(0x80) . ' symbols ' . chr(0x80) . ' in ' . chr(0x80) . ' text'

// A UTF-8 nemesis from MSFT Notepad: the three-byte byte order mark
, chr(0xEF) . chr(0xBB) . chr(0xBF) . 'Thanks for the BOM, Notepad'

// A Bogus character that should not be translated
, 'Bogus 0x81: ' . chr(0x81)

// Anthony Ferrara test data
, chr(0xC0) . chr(0x80)          // Overlong encoding of code point 0
, chr(0xF8) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80)  // Overlong encoding of 5 byte encoding
, chr(0xFC) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80) . chr(0x80)  // Overlong encoding of 6 byte encoding
, chr(0xD0) . chr(0x01)          // Lead byte of a two-byte sequence without a continuation byte
, chr(0x01) . chr(0x01) . chr(0x01) // Actually valid ;-)

];





// First pass: hand every sample to the class without a repair request.
echo '<h3>Data Not Repaired</h3>';

foreach ($arr as $sample):
    // Show the input bytes as they are.
    hexdump($sample);
    echo PHP_EOL;

    // Build the object from the sample alone, then show the string the
    // object holds, followed by the whole object.
    $checked = new UTF8($sample);
    hexdump($checked->str);
    echo print_r($checked, TRUE), PHP_EOL;
endforeach;





// Malformed UTF-8 samples on which a repair will be attempted.
// The stray bytes are written as \x escapes inside double-quoted strings.
$bad = [
    "AE Ligature at end: \xC6",
    "Pound at end: \xA3",
    "The \x80 Euro symbol",
    "Several \x80 Euro \x80 symbols \x80 in \x80 text",

    // A bogus byte for which no translation exists
    "Bogus 0x81: \x81",
];



// Repair pass: hand every damaged sample to the class with the second
// constructor argument set to TRUE.
echo '<h3>Data Repair Attempted</h3>';

foreach ($bad as $damaged):
    // Show the input bytes as they are.
    hexdump($damaged);
    echo PHP_EOL;

    // NOTE(review): TRUE presumably switches on the repair attempt -
    // confirm against class_UTF8.php.
    $mended = new UTF8($damaged, TRUE);

    // Show the string the object now holds, followed by the whole object.
    hexdump($mended->str);
    echo print_r($mended, TRUE), PHP_EOL;
endforeach;







/**
 * Unrelated utility function to show us the hex byte values of a string.
 *
 * Writes, each preceded by $br: a position scale cut to the byte length of
 * the string, the string itself, a row with the high nibble of every byte,
 * and a row with the low nibble of every byte.  A final $br ends the output.
 *
 * @param mixed  $str The value to dump; it is converted to a string first.
 * @param string $br  Separator written before each row (default PHP_EOL).
 *
 * @return false|null FALSE when there is nothing to dump (NULL, FALSE, an
 *                    empty array or an empty string); otherwise no value.
 */
function hexdump($str, $br=PHP_EOL)
{
    // Skip only values that are really absent.  empty() is not used here
    // because it also rejects '0', which is a legitimate string to dump.
    if ($str === NULL || $str === FALSE || $str === []) return FALSE;

    $str = (string) $str;
    if ($str === '') return FALSE;

    // Two lowercase hex digits per byte, e.g. 'AB' becomes '4142'
    $hex = bin2hex($str);

    // Allocate each byte's digits into hi and lo nibble rows
    $hi = '';
    $lo = '';
    for ($i = 0, $len = strlen($hex); $i < $len; $i += 2)
    {
        $hi .= $hex[$i];
        $lo .= $hex[$i + 1];
    }

    // The scale is cut to the byte length so it lines up with the nibble rows;
    // it covers at most 130 bytes.
    $num = substr('1...5...10...15...20...25...30...35...40...45...50...55...60...65...70...75...80...85...90...95..100..105..110..115..120..125..130', 0, strlen($str));

    // Show the scale, the string and the hex
    echo $br . $num;
    echo $br . $str;
    echo $br . $hi;
    echo $br . $lo;
    echo $br;
}

About us

Advertise on this site

For more information send a message to info at phpclasses dot org.

File: demo_UTF8.php

Contents