File: Maps/unicode-to-ansi.map

Recommend this page to a friend!
  Classes of Christian Vigh  >  PHP PDF to Text  >  Maps/unicode-to-ansi.map  >  Download  
File: Maps/unicode-to-ansi.map
Role: Auxiliary data
Content type: text/plain
Description: Auxiliary data
Class: PHP PDF to Text
Extract text contents from PDF files
Author: By Christian Vigh
Last change: Updated mappings
Date: 3 years ago
Size: 6,300 bytes
 

 

Contents

Class file image Download
<?php
	/*
	 * Lookup table translating "alien" Unicode code points (special spaces, typographic quotes and dashes,
	 * ligatures, Roman numerals, full-width forms...) to a plain ASCII replacement string.
	 *
	 * Keys   : Unicode code points, as integers.
	 * Values : the replacement string ; an empty string means that the character is simply dropped.
	 *
	 * NOTE(review): the code consuming this table is not visible from this file ; the original entry order
	 * has been preserved in case a consumer iterates over the map.
	 */
	$unicode_to_ansi	=  array
	   (
		// Found in some Polish documents. These code points are C1 control characters ; the replacements
		// follow the meaning of the same byte values in the Windows-1250 / Windows-1252 code pages.
		0x0084			=>  '"',				// Byte 0x84 : double low-9 quotation mark
		0x0085			=>  '...',				// Byte 0x85 : ellipsis
		0x0092			=>  "'",				// Byte 0x92 : right single quotation mark (was wrongly mapped to a double quote)
		0x0094			=>  '"',				// Byte 0x94 : right double quotation mark
		0x0096			=>  '-',				// Byte 0x96 : EN dash
		// End Polish

		0x00A0			=>  ' ',				// Non-breakable space
		0x00AB			=>  '"',				// Left pointing double angle quotation mark
		0x00BB			=>  '"',				// Right pointing double angle quotation mark (counterpart of 0x00AB)
		0x00AD			=>  '',					// Soft hyphen : invisible line break opportunity, so it is dropped
		0x00C6			=>  'AE',				// AE with ligature (&AElig;)
		0x00E6			=>  'ae',				// ae with ligature (&aelig;)
		0x1680			=>  ' ',				// OGHAM space mark
		0x0152			=>  'OE',				// OE with ligature (&OElig;)
		0x0153			=>  'oe',				// oe with ligature (&oelig;)
		0x1D6B			=>  'ue',				// ue with ligature
		0x2000			=>  ' ',				// EN quad
		0x2001			=>  ' ',				// EM quad
		0x2002			=>  ' ',				// EN space
		0x2003			=>  ' ',				// EM space
		0x2004			=>  ' ',				// 3-per-EM space
		0x2005			=>  ' ',				// 4-per-EM space
		0x2006			=>  ' ',				// 6-per-EM space
		0x2007			=>  ' ',				// Figure space
		0x2008			=>  ' ',				// Punctuation space
		0x2009			=>  ' ',				// Thin space
		0x200A			=>  ' ',				// Hair space
		0x200B			=>  ' ',				// Zero-width space (marks a word boundary, hence a regular space)
		0x200C			=>  '',					// Zero-width non-joiner (occurs inside words : dropped, like the joiner below)
		0x200D			=>  '',					// Zero-width joiner
		0x2010			=>  '-',				// Hyphen
		0x2011			=>  '-',				// Non-breaking hyphen
		0x2012			=>  '-',				// Figure dash (has the same width as digits)
		0x2013			=>  '-',				// EN dash (used to indicate range of values)
		0x2014			=>  ' - ',				// EM dash (used to make a break in a flow of sentences)
		0x2015			=>  '- ',				// Horizontal bar, used to introduce quoted text
		0x2018			=>  "'",				// Left single quotation mark (German right single quote)
		0x2019			=>  "'",				// Right single quotation mark (secondary level quotation, apostrophe)
		0x201A			=>  "'",				// Single low-9 quotation mark (German left single quote)
		0x201B			=>  "'",				// Reversed quote
		0x201C			=>  '"',				// Left double quotation mark
		0x201D			=>  '"',				// Right double quotation mark
		0x201E			=>  '"',				// Double low-9 quotation mark
		0x2026			=>  '...',				// Ellipsis
		0x2028			=>  "\n",				// Line separator
		0x2029			=>  "\n",				// Paragraph separator
		0x202F			=>  ' ',				// Narrow non-break space
		0x2039			=>  "'",				// Single left pointing angle quotation mark
		0x203A			=>  "'",				// Single right pointing angle quotation mark
		0x2053			=>  '~',				// Swung dash (large tilde)
		0x205F			=>  ' ',				// Medium mathematical space
		0x2060			=>  '',					// Word joiner
		0x207B			=>  '-',				// Superscript minus
		0x208B			=>  '-',				// Subscript minus
		0x2160			=>  'I',				// Roman numeral : I
		0x2161			=>  'II',				// Roman numeral : II
		0x2162			=>  'III',				// Roman numeral : III
		0x2163			=>  'IV',				// Roman numeral : IV
		0x2164			=>  'V',				// Roman numeral : V
		0x2165			=>  'VI',				// Roman numeral : VI
		0x2166			=>  'VII',				// Roman numeral : VII
		0x2167			=>  'VIII',				// Roman numeral : VIII
		0x2168			=>  'IX',				// Roman numeral : IX
		0x2169			=>  'X',				// Roman numeral : X
		0x216A			=>  'XI',				// Roman numeral : XI
		0x216B			=>  'XII',				// Roman numeral : XII
		0x216C			=>  'L',				// Roman numeral : L
		0x216D			=>  'C',				// Roman numeral : C
		0x216E			=>  'D',				// Roman numeral : D
		0x216F			=>  'M',				// Roman numeral : M
		0x2170			=>  'i',				// Roman numeral : i
		0x2171			=>  'ii',				// Roman numeral : ii
		0x2172			=>  'iii',				// Roman numeral : iii
		0x2173			=>  'iv',				// Roman numeral : iv
		0x2174			=>  'v',				// Roman numeral : v
		0x2175			=>  'vi',				// Roman numeral : vi
		0x2176			=>  'vii',				// Roman numeral : vii
		0x2177			=>  'viii',				// Roman numeral : viii
		0x2178			=>  'ix',				// Roman numeral : ix
		0x2179			=>  'x',				// Roman numeral : x
		0x217A			=>  'xi',				// Roman numeral : xi
		0x217B			=>  'xii',				// Roman numeral : xii
		0x217C			=>  'l',				// Roman numeral : l
		0x217D			=>  'c',				// Roman numeral : c
		0x217E			=>  'd',				// Roman numeral : d
		0x217F			=>  'm',				// Roman numeral : m
		0x2212			=>  '-',				// Minus sign (arithmetic operator)
		0x2758			=>  '|',				// Light vertical bar
		0x2759			=>  '|',				// Medium vertical bar
		0x2E3A			=>  '-',				// Two-EM dash
		0x2E3B			=>  '-',				// Three-EM dash
		0x3000			=>  ' ',				// Ideographic space
		0x301D			=>  '"',				// Reversed double prime quotation mark
		0x301E			=>  '"',				// Double prime quotation mark
		0x301F			=>  '"',				// Low double prime quotation mark
		0xA728			=>  'TZ',				// TZ with ligature
		0xA729			=>  'tz',				// tz with ligature
		0xA732			=>  'AA',				// AA with ligature
		0xA733			=>  'aa',				// aa with ligature
		0xA734			=>  'AO',				// AO with ligature
		0xA735			=>  'ao',				// ao with ligature
		0xA736			=>  'AU',				// AU with ligature
		0xA737			=>  'au',				// au with ligature
		0xA738			=>  'AV',				// AV with ligature
		0xA739			=>  'av',				// av with ligature
		0xA73A			=>  'AV',				// AV with ligature and bar
		0xA73B			=>  'av',				// av with ligature and bar
		0xA73C			=>  'AY',				// AY with ligature
		0xA73D			=>  'ay',				// ay with ligature
		0xA74E			=>  'OO',				// OO with ligature
		0xA74F			=>  'oo',				// oo with ligature
		0xA760			=>  'VY',				// VY with ligature
		0xA761			=>  'vy',				// vy with ligature
		0xFB00			=>  'ff',				// ff with ligature
		0xFB01			=>  'fi',				// fi with ligature
		0xFB02			=>  'fl',				// fl with ligature
		0xFB03			=>  'ffi',				// ffi with ligature
		0xFB04			=>  'ffl',				// ffl with ligature
		0xFB05			=>  'st',				// Long s + t ligature : the long s looks like an 'f' but is an 's'
		0xFB06			=>  'st',				// st with ligature
		0xFF08			=>  '(',				// Full width left parenthesis
		0xFF09			=>  ')',				// Full width right parenthesis
		0xFE31			=>  '|',				// Vertical em dash
		0xFE32			=>  '|',				// Vertical en dash
		0xFE58			=>  '-',				// Small em dash
		0xFE63			=>  '-',				// Small ASCII hyphen
		0xFF02			=>  '"',				// Full width quotation mark
		0xFF07			=>  "'",				// Full width apostrophe
		0xFF0D			=>  '-',				// Full-width hyphen variant of ascii hyphen
		0xFEFF			=>  '',					// Zero-width non-breaking space / BOM (invisible : dropped, like the word joiner 0x2060)
	    ) ;

For more information send a message to info at phpclasses dot org.