import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
 * Halfwidth and Fullwidth Character Normalization for CJK
 * http://solutions.asia
 *
 * See the Unicode Standard 6.0 - Halfwidth and Fullwidth Forms
 * http://unicode.org/charts/PDF/UFF00.pdf
 *
 * For Chinese, Japanese and Korean, some characters have Unicode mappings to
 * both a halfwidth and a fullwidth version. This code normalizes them
 * to halfwidth for latin characters, numbers and punctuation and fullwidth
 * for everything else.
 *
 * Fine for half/full width normalization but not fully equivalent to NFKC
 * normalization. In particular, the halfwidth (semi-)voiced sound marks are
 * replaced by the combining marks U+3099 / U+309A and are not composed with
 * the preceding katakana.
 */
public class CJKHalfFullWidthNormalize {

    /**
     * Read-only lookup table.
     * Key - original character; value - replacement character.
     */
    private static final Map<Character, Character> CHAR_CODE_MAP = buildCharCodeMap();

    /**
     * Maps every code unit in {@code [first, last]} to the code unit
     * {@code offset} positions below it.
     *
     * @param map    table receiving the mappings
     * @param first  first source character (inclusive)
     * @param last   last source character (inclusive)
     * @param offset distance subtracted from each source character
     */
    private static void putRange(Map<Character, Character> map, char first, char last, int offset) {
        // Iterate with an int so the loop counter can never wrap around.
        for (int code = first; code <= last; code++) {
            map.put((char) code, (char) (code - offset));
        }
    }

    /** Builds the complete halfwidth/fullwidth replacement table. */
    private static Map<Character, Character> buildCharCodeMap() {
        Map<Character, Character> map = new HashMap<Character, Character>();

        // ---------------- TO HALFWIDTH CHARACTERS ----------------

        // ASCII variants (Latin symbols, punctuation, numbers and alphabet):
        // U+FF01..U+FF5E -> U+0021..U+007E
        putRange(map, '\uff01', '\uff5e', 0xfee0);

        // Brackets
        map.put('\uff5f', '\u2985'); // left white parenthesis
        map.put('\uff60', '\u2986'); // right white parenthesis

        // Symbol variants
        map.put('\uffe0', '\u00a2'); // Cent sign
        map.put('\uffe1', '\u00a3'); // Pound sign
        map.put('\uffe2', '\u00ac'); // Not sign
        map.put('\uffe3', '\u00af'); // Macron
        map.put('\uffe4', '\u00a6'); // Broken Bar
        map.put('\uffe5', '\u00a5'); // Yen sign
        map.put('\uffe6', '\u20a9'); // Won sign

        // Space (strictly speaking not listed in Unicode 6.0 Halfwidth and
        // Fullwidth forms but including here as the ideographic space can
        // cause issues)
        map.put('\u3000', '\u0020'); // SPACE

        // ---------------- TO FULLWIDTH CHARACTERS ----------------

        // CJK punctuation
        map.put('\uff61', '\u3002'); // ideographic full stop
        map.put('\uff62', '\u300c'); // left corner bracket
        map.put('\uff63', '\u300d'); // right corner bracket
        map.put('\uff64', '\u3001'); // ideographic comma

        // Katakana variants
        map.put('\uff65', '\u30fb'); // Middle Dot
        map.put('\uff66', '\u30f2'); // Wo
        map.put('\uff67', '\u30a1'); // A small
        map.put('\uff68', '\u30a3'); // I small
        map.put('\uff69', '\u30a5'); // U small
        map.put('\uff6a', '\u30a7'); // E small
        map.put('\uff6b', '\u30a9'); // O small
        map.put('\uff6c', '\u30e3'); // Ya small
        map.put('\uff6d', '\u30e5'); // Yu small
        map.put('\uff6e', '\u30e7'); // Yo small
        map.put('\uff6f', '\u30c3'); // Tsu small
        map.put('\uff70', '\u30fc'); // Prolonged Sound Mark
        map.put('\uff71', '\u30a2'); // A
        map.put('\uff72', '\u30a4'); // I
        map.put('\uff73', '\u30a6'); // U
        map.put('\uff74', '\u30a8'); // E
        map.put('\uff75', '\u30aa'); // O
        map.put('\uff76', '\u30ab'); // Ka
        map.put('\uff77', '\u30ad'); // Ki
        map.put('\uff78', '\u30af'); // Ku
        map.put('\uff79', '\u30b1'); // Ke
        map.put('\uff7a', '\u30b3'); // Ko
        map.put('\uff7b', '\u30b5'); // Sa
        map.put('\uff7c', '\u30b7'); // Shi
        map.put('\uff7d', '\u30b9'); // Su
        map.put('\uff7e', '\u30bb'); // Se
        map.put('\uff7f', '\u30bd'); // So
        map.put('\uff80', '\u30bf'); // Ta
        map.put('\uff81', '\u30c1'); // Chi
        map.put('\uff82', '\u30c4'); // Tsu
        map.put('\uff83', '\u30c6'); // Te
        map.put('\uff84', '\u30c8'); // To
        map.put('\uff85', '\u30ca'); // Na
        map.put('\uff86', '\u30cb'); // Ni
        map.put('\uff87', '\u30cc'); // Nu
        map.put('\uff88', '\u30cd'); // Ne
        map.put('\uff89', '\u30ce'); // No
        map.put('\uff8a', '\u30cf'); // Ha
        map.put('\uff8b', '\u30d2'); // Hi
        map.put('\uff8c', '\u30d5'); // Hu
        map.put('\uff8d', '\u30d8'); // He
        map.put('\uff8e', '\u30db'); // Ho
        map.put('\uff8f', '\u30de'); // Ma
        map.put('\uff90', '\u30df'); // Mi
        map.put('\uff91', '\u30e0'); // Mu
        map.put('\uff92', '\u30e1'); // Me
        map.put('\uff93', '\u30e2'); // Mo
        map.put('\uff94', '\u30e4'); // Ya
        map.put('\uff95', '\u30e6'); // Yu
        map.put('\uff96', '\u30e8'); // Yo
        map.put('\uff97', '\u30e9'); // Ra
        map.put('\uff98', '\u30ea'); // Ri
        map.put('\uff99', '\u30eb'); // Ru
        map.put('\uff9a', '\u30ec'); // Re
        map.put('\uff9b', '\u30ed'); // Ro
        map.put('\uff9c', '\u30ef'); // Wa
        map.put('\uff9d', '\u30f3'); // N
        map.put('\uff9e', '\u3099'); // Voiced Sound Mark
        map.put('\uff9f', '\u309a'); // Semi-Voiced Sound Mark

        // Hangul variants
        map.put('\uffa0', '\u3164'); // Hangul Filler

        // Hangul first range: KIYEOK to HIEUH (U+FFA1..U+FFBE -> U+3131..U+314E)
        putRange(map, '\uffa1', '\uffbe', 0xce70);

        // Hangul second range: A to E (U+FFC2..U+FFC7 -> U+314F..U+3154)
        putRange(map, '\uffc2', '\uffc7', 0xce73);

        // Hangul third range: YEO to OE (U+FFCA..U+FFCF -> U+3155..U+315A)
        putRange(map, '\uffca', '\uffcf', 0xce75);

        // Hangul fourth range: YO to YU (U+FFD2..U+FFD7 -> U+315B..U+3160)
        putRange(map, '\uffd2', '\uffd7', 0xce77);

        // More Hangul variants
        map.put('\uffda', '\u3161'); // Hangul EU
        map.put('\uffdb', '\u3162'); // Hangul YI
        map.put('\uffdc', '\u3163'); // Hangul I

        // Symbol variants
        map.put('\uffe8', '\u2502'); // Forms Light Vertical
        map.put('\uffe9', '\u2190'); // Leftwards Arrow
        map.put('\uffea', '\u2191'); // Upwards Arrow
        map.put('\uffeb', '\u2192'); // Rightwards Arrow
        map.put('\uffec', '\u2193'); // Downwards Arrow
        map.put('\uffed', '\u25a0'); // Black Square
        map.put('\uffee', '\u25cb'); // White Circle

        // The table is shared static state; hand out a read-only view.
        return Collections.unmodifiableMap(map);
    }

    /**
     * Normalizes the halfwidth/fullwidth forms in a string.
     *
     * Every character that has an entry in the replacement table is swapped
     * for its replacement; all other characters are copied unchanged. The
     * result always has the same length as the input.
     *
     * @param text the string to normalize; must not be {@code null}
     * @return the normalized string
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String normalize(String text) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }
        // Working per UTF-16 code unit is safe here: no key or value in the
        // table is a surrogate, so supplementary characters pass through intact.
        char[] buffer = text.toCharArray();
        for (int i = 0; i < buffer.length; i++) {
            Character replacement = CHAR_CODE_MAP.get(buffer[i]);
            if (replacement != null) {
                buffer[i] = replacement;
            }
        }
        return new String(buffer);
    }

    /**
     * Command-line entry point: takes an unnormalized (halfwidth/fullwidth)
     * string as the first argument and prints both the original and the
     * normalized string to standard output.
     *
     * If no argument is supplied, a usage message is written to standard
     * error and the process exits with status 1.
     *
     * @param args command-line arguments; {@code args[0]} is the string to
     *             normalize, any further arguments are ignored
     */
    public static void main(String[] args) {
        if (args == null || args.length == 0) {
            System.err.println("Usage: java CJKHalfFullWidthNormalize <string>");
            System.exit(1);
            return;
        }
        String unnormalized = args[0];
        System.out.println("Unnormalized:\t " + unnormalized);
        System.out.println("Normalized:\t " + normalize(unnormalized));
    }
}