生活随笔
收集整理的這篇文章主要介紹了
JAVA版StarDict星际译王简单实现
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
由胡正開發的星際譯王是Linux平臺上很強大的一個開源的翻譯軟件(也有Windows版本的)支持多種詞庫、多種語言版本。尤其詞庫設計比較合理。之前看到一篇博文《星際譯王詞庫應用-自制英漢詞典》中用簡短的程序就實現了詞典的基本功能,不過那個是Linux 下的C/C++版本的,于是決定參考移植一個JAVA版本。 import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.*; /** * {@docRoot} *?Java版詞典測試版,可以在控制臺下輸入要查詢的單詞,回車后會給出單詞在詞典中的釋義 * 詞典采用星際譯王的詞典,本程序主要針對英漢詞典 *? * @author menglongbor * @updateDate 2012-12-31 * @version v1.0.0 *? * 相關參考鏈接: * http://blog.chinaunix.net/uid-20454005-id-1675913.html * http://hi.baidu.com/sean_zhu_xiang/item/1581342f88be430e73863eee * http://blog.csdn.net/ranxiedao/article/details/7787342 * http://www.stardict.cn/ * http://www.huzheng.org/ * http://code.google.com/p/stardict-3/downloads/list *? */ public class testdict { final static intMAX_WORD= 256;// 最長輸入單詞字符數 final static intMAX_KEYS= 27;// 26個字母+"-"開頭的后綴 final static intSIZEINT= 4; final static StringKEY[]= {// 26個字母索引+"-"開頭的后綴,不區分大小寫 "A", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "-" }; public static InputStreamisidx= null;// 讀取idx文件時所要的流 public static InputStreamisdict= null;// 讀取dict文件時所要的流 public static longSTREAM_LOCAL= 0;// 記錄單詞索引在文件流中的位置 public static StringidxfileString= "oxford-gb.idx";// idx文件路徑 public static StringdictfileString= "oxford-gb.dict";// dict文件路徑 /** * 從idx文件中獲取當前目標單詞 * @param word_buf 保存的是c/c++字符串數組轉換為JAVA字符串 * @param data_poffset 用來保存單詞的data偏移位置信息 * @param data_plength 用來保存單詞的data長度信息 * @param len * @return */ public static boolean get_word(String[] word_buf, int[] data_poffset, int[] data_plength, int[] len) { // int len = 0; boolean flag = true; len[0] = 0; int index = -1; byte wd[] = new byte[MAX_WORD]; int value = 0; try { // 讀取單詞,對每個字母開頭的單詞都進行搜索,最多考慮256個字符的單詞, // 讀到單詞結束符\0時賦值表達式的值就不滿足while條件而退出 while (true) { index = isidx.read(); STREAM_LOCAL++;// 每讀取一次,位置標識加一以記錄下單詞在文件流中的起始位置 if (index == -1) { // isidx.reset(); flag = false; break; } if ((index != 0) && (len[0] < MAX_WORD)) { wd[len[0]] = (byte) index;// 將int轉換為byte len[0]++; } else { break; } } // 轉換為JAVA字符串 // 此處不用再需要像c/c++那樣去掉了最后那個結束符了 byte wd2[] = new byte[len[0]]; for (int i = 0; i < len[0]; i++) { wd2[i] = wd[i]; } word_buf[0] = new String(wd2); // System.out.println("get_word:"+word_buf[0]+" len:"+len[0]); // wd = null;// 釋放內存 // wd2 = null; // 讀取偏移量值 for (int i = 0; i < SIZEINT; i++) { // 將4個byte轉換為int int shift = (4 - 1 - i) * 8; index = isidx.read(); STREAM_LOCAL++;// 每讀取一次,位置標識加一以記錄下單詞在文件流中的起始位置 if (index == -1) { // isidx.reset(); flag = false; return flag; } value += (index & 0x00FF) << shift; } data_poffset[0] = value; // 讀取區塊大小值 value = 0; for (int i = 0; i < SIZEINT; i++) { // 將4個byte轉換為int int shift = (4 - 1 - i) * 8; index = isidx.read(); STREAM_LOCAL++;// 每讀取一次,位置標識加一以記錄下單詞在文件流中的起始位置 if (index == -1) { // isidx.reset(); flag = false; return flag; } value += (index & 0x00FF) << shift; } data_plength[0] = value; } catch (Exception e) { System.out.println("idx file read error!"); } // System.out.println("Now local is:"+STREAM_LOCAL); // 得到單詞字符長度 return flag; } /** * 通過偏移位置offset和長度length 來從dict文件中獲取data內容UTF-8編碼的字符 * @param offset 要讀取的內容的起始偏移,為字節數 * @param length 要讀取的內容的數據塊大小,為字節數 * @return 字節數組的data int */ public static byte[] get_data(int[] offset, int[] length) { long oft = offset[0]; long len = length[0]; long skip; byte data_buf[] = new byte[length[0]]; System.out.println("This word's" + "offset:" + offset[0] + "len:" + length[0]); try { isdict.reset(); long valuedata = isdict.available(); if (valuedata < oft + len) { System.out.println("No so much value data! " + valuedata); } // skip=isdict.skip(oft); skip = skipBytesFromStream(isdict, oft); if (skip != oft) { System.out.println("Skip" + skip + " dict file error!"); } if (isdict.read(data_buf) == -1) { System.out.println("Arrive at the end of file!"); } // // Unicode // StringBuffer sb = new StringBuffer(); // // int size =isdict.read(data_buf); // // for (int j = 0; j < size;) // { // // int l = data_buf[j++]; // // int h = data_buf[j++]; // // char c = (char) ((l & 0xff) | ((h << 8) & 0xff00)); // // sb.append(c); // // } // // // return sb.toString(); } catch (Exception e) { data_buf = null; System.out.println("dict file read error!"); e.printStackTrace(); } if (data_buf == null) { return null; } return data_buf; } /** * utf8解碼 參考自http://hi.baidu.com/leo10086/item/d6853813373b19001994ec24 用法: * 假如 newContent 為UTF8編碼的字符串 byte[] b = newContent.getBytes(); newContent = * URLEncoder.UTF8Decode( b, 0, b.length ); * @param in 要進行解碼的UTF8編碼的字節數組 * @param offset * @param length * @return */ public static String UTF8Decode(byte in[], int offset, int length) { StringBuffer buff = new StringBuffer(); int max = offset + length; for (int i = offset; i < max; i++) { char c = 0; if ((in[i] & 0x80) == 0) { c = (char) in[i]; } else if ((in[i] & 0xe0) == 0xc0) // 11100000 { c |= ((in[i] & 0x1f) << 6); // 00011111 i++; c |= ((in[i] & 0x3f) << 0); // 00111111 } else if ((in[i] & 0xf0) == 0xe0) // 11110000 { c |= ((in[i] & 0x0f) << 12); // 00001111 i++; c |= ((in[i] & 0x3f) << 6); // 00111111 i++; c |= ((in[i] & 0x3f) << 0); // 00111111 } else if ((in[i] & 0xf8) == 0xf0) // 11111000 { c |= ((in[i] & 0x07) << 18); // 00000111 (move 18, not 16?) i++; c |= ((in[i] & 0x3f) << 12); // 00111111 i++; c |= ((in[i] & 0x3f) << 6); // 00111111 i++; c |= ((in[i] & 0x3f) << 0); // 00111111 } else { c = ' '; } buff.append(c); } return buff.toString(); } public static byte[] UTF8Encode(String str) { ByteArrayOutputStream bos = new ByteArrayOutputStream(); try { int strlen = str.length(); for (int i = 0; i < strlen; i++) { char t = str.charAt(i); int c = 0; c |= (t & 0xffff); if (c >= 0 && c < 0x80) { bos.write((byte) (c & 0xff)); } else if (c > 0x7f && c < 0x800) { bos.write((byte) (((c >>> 6) & 0x1f) | 0xc0)); bos.write((byte) (((c >>> 0) & 0x3f) | 0x80)); } else if (c > 0x7ff && c < 0x10000) { bos.write((byte) (((c >>> 12) & 0x0f) | 0xe0)); // <-- // correction // (mb) bos.write((byte) (((c >>> 6) & 0x3f) | 0x80)); bos.write((byte) (((c >>> 0) & 0x3f) | 0x80)); } else if (c > 0x00ffff && c < 0xfffff) { bos.write((byte) (((c >>> 18) & 0x07) | 0xf0)); bos.write((byte) (((c >>> 12) & 0x3f) | 0x80)); bos.write((byte) (((c >>> 6) & 0x3f) | 0x80)); bos.write((byte) (((c >>> 0) & 0x3f) | 0x80)); } } bos.flush(); } catch (Exception e) { } return bos.toByteArray(); } /** * 將UTF-8字節數據轉化為Unicode字符串 *? * @param utf_data * ? ? ? ? ? ?byte[] - UTF-8編碼字節數組 * @param len * ? ? ? ? ? ?int - 字節數組長度 * @return String - 變換后的Unicode編碼字符串 */ public static String UTF2Uni(byte[] utf_data, int len) { StringBuffer unis = new StringBuffer(); char unic = 0; int ptr = 0; int cntBits = 0; for (; ptr < len;) { cntBits = getCntBits(utf_data[ptr]); if (cntBits == -1) { ++ptr; continue; } else if (cntBits == 0) { unic = UTFC2UniC(utf_data, ptr, cntBits); ++ptr; } else { unic = UTFC2UniC(utf_data, ptr, cntBits); ptr += cntBits; } unis.append(unic); } return unis.toString(); } /** * 將指定的UTF-8字節組合成一個Unicode編碼字符 * @param utf byte[] - UTF-8字節數組 * @param sptr int - 編碼字節起始位置 * @param cntBits int - 編碼字節數 * @return char - 變換后的Unicode字符 */ public static char UTFC2UniC(byte[] utf, int sptr, int cntBits) { /* * Unicode <-> UTF-8 U-00000000 - U-0000007F: 0xxxxxxx U-00000080 - * U-000007FF: 110xxxxx 10xxxxxx U-00000800 - U-0000FFFF: 1110xxxx * 10xxxxxx 10xxxxxx U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx * 10xxxxxx U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx * 10xxxxxx U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx * 10xxxxxx 10xxxxxx */ int uniC = 0; // represent the unicode char byte firstByte = utf[sptr]; int ptr = 0; // pointer 0 ~ 15 // resolve single byte UTF-8 encoding char if (cntBits == 0) return (char) firstByte; // resolve the first byte firstByte &= (1 << (7 - cntBits)) - 1; // resolve multiple bytes UTF-8 encoding char(except the first byte) for (int i = sptr + cntBits - 1; i > sptr; --i) { byte utfb = utf[i]; uniC |= (utfb & 0x3f) << ptr; ptr += 6; } uniC |= firstByte << ptr; return (char) uniC; } /** * 根據給定字節計算UTF-8編碼的一個字符所占字節數,UTF-8規則定義,字節標記只能為0或2~6 * @param b * @return */ private static int getCntBits(byte b) { int cnt = 0; if (b == 0) return -1; for (int i = 7; i >= 0; --i) { if (((b >> i) & 0x1) == 1) ++cnt; else break; } return (cnt > 6 || cnt == 1) ? -1 : cnt; } /** * 顯示data內容 * @param data_buf UTF-8的單詞釋義數組 * @param data_length UTF-8的單詞釋義數組長度 */ public static void display_data(byte[] data_buf, int data_length[]) { // 將UTF-8byte字節數組轉為當前環境字符并顯示 // String tempString = UTF8Decode(data_buf, 0, data_length[0]); String tempString = UTF2Uni(data_buf, data_length[0]); // String tempString = new String(data_buf); data_buf = null; System.out.println(tempString); } /** * 從idx文件中搜索由word指定的單詞,并保存相應的偏移和長度信息 * @param word * @param data_poffset * @param data_plength * @return 是否搜索成功 */ public static boolean search_word(String word, int[] data_poffset, int[] data_plength) { String wd[] = new String[1]; boolean temp = false; int len[] = new int[1]; // 從idx文件中獲取當前目標單詞 // for (get_word(wd, data_poffset, data_plength); end; get_word(wd, // data_poffset, data_plength)) // { while (get_word(wd, data_poffset, data_plength, len)) { // System.out.println("compared_word:"+wd[0]); // if (wd[0].compareToIgnoreCase(word) == 0) // // 比較字符串s1和s2,但不區分字母的大小寫 if (strsEqualsIgnoreCase(wd[0], word) == 0) { System.out.println("compared_word:" + word + " " + wd[0]); temp = true; break; } } return temp; } /** * 從標準輸入獲取待查詢的單詞,控制臺下為GBK字符,字典索引中的英文單詞字母也是如此 * @param max_len * @param count * @return */ public static String get_input(int max_len, int[] count) { byte input_buf[] = new byte[max_len]; count[0] = 0; String tempString[] = new String[1]; try { count[0] = System.in.read(input_buf) - 2;// 返回實際讀取到的字符數,減去2個控制字符 byte temp_buf[] = new byte[count[0]]; for (int i = 0; i < count[0]; i++) { temp_buf[i] = input_buf[i]; } tempString[0] = new String(temp_buf); } catch (Exception e) { System.out.println("Input error!"); } System.out.println("Your input is:" + tempString[0]); return tempString[0]; } /** * 從標準輸入獲取待查詢的單詞,控制臺下為GBK字符,字典索引中的英文單詞字母也是如此 * @param input_buf * @param count * @return */ public static byte[] get_input(byte[] input_buf, int[] count) { try { count[0] = System.in.read(input_buf) - 2;// 返回實際讀取到的字符數,減去2個控制字符 } catch (Exception e) { input_buf = null; System.out.println("Input error!"); } return input_buf; } /** * 緩存KEYS在idx中的偏移信息,以便加快search_word的搜索速度 * @param idx_cache 保存每個單字母單詞對應的起始位置 * @return */ public static void cache_idx(long[] idx_cache) { int i; long[] p = idx_cache; int unused1[] = new int[1]; int unused2[] = new int[1]; try { // 將文件內部的位置指針重新指向一個流(數據流/文件)的開頭返回FILE指針當前位置, // 然后重新遍歷整個文件搜尋下一個字母開頭的單詞 isidx.reset(); STREAM_LOCAL = 0; for (i = 0; i < MAX_KEYS; i++) { // System.out.println("Start search_word:" + KEY[i]); if (search_word(KEY[i], unused1, unused2))// 從idx文件中搜索由word指定的單詞,并保存相應的偏移和長度信息 { p[i] = STREAM_LOCAL; // 返回當前文件位置 // String tempString = Long.toString(STREAM_LOCAL); // System.out.println(KEY[i] + "'s local is:" + tempString); System.out.println(KEY[i] + "'s local is:" + STREAM_LOCAL + " offset:" + unused1[0] + "length:" + unused2[0]); } else p[i] = 0; } // isidx.reset(); } catch (Exception e) { // TODO: handle exception } } /** * 定位由word指定的單詞在idx文件中的大概偏移位置 * @param word * @param idx_cache * @return */ public static long locate_idx(String word, long[] idx_cache) { int i = 0; int pre = 0; String tempString = word.toLowerCase(); while (i < MAX_KEYS && KEY[i].charAt(0) < tempString.charAt(0)) { pre = i; ++i; } if (tempString.charAt(0) == '-') { pre = 0; } System.out.println("Now word's locate is:" + idx_cache[pre]); return idx_cache[pre]; } /** * 主要查詢函數 */ public static void consult() { byte data[] = null;// 釋義數據,UTF-8數據 long idx[] = new long[MAX_KEYS];// 26個字母孤立單詞+"-"開頭的后綴對應的索引緩沖 int offset[] = new int[1]; int length[] = new int[1]; System.out.println("Start cache_idx....!"); try { System.out.println("Open files....!"); // 讀取字典索引文件 isidx = new BufferedInputStream(new FileInputStream( idxfileString)); isidx.mark(isidx.available() + 1); if (!isidx.markSupported()) { System.out.println("This stream do not support mark....!"); } } catch (Exception e) { System.out.println("Open files error!"); e.printStackTrace(); } cache_idx(idx);// 緩存KEYS在idx中的偏移信息,以便加快search_word的搜索速度 try { isdict = new BufferedInputStream(new FileInputStream( dictfileString)); isdict.mark(isdict.available() + 1); if (!isdict.markSupported()) { System.out.println("This stream do not support mark....!"); } } catch (Exception e) { System.out.println("Open files error!"); e.printStackTrace(); } while (true) { System.out.println("INPUT A WORD OR PHRASE: "); int count[] = new int[1]; String word = get_input(MAX_WORD, count); long skips1, skips2; if (count[0] > 0)// 從控制臺得到輸入單詞字符 { try { // 從文件開頭跳到單詞大致索引所在位置 // isidx.mark(0); isidx.reset(); skips1 = locate_idx(word, idx); // skips2 = isidx.skip(skips1); skips2 = skipBytesFromStream(isidx, skips1); System.out .println("skips1:" + skips1 + " skips2:" + skips2); } catch (Exception e) { System.out.println("locate_idx run error"); e.printStackTrace(); } if (search_word(word, offset, length)) { data = get_data(offset, length); display_data(data, length); data = null; } else System.out.println("SORRY " + word + " CANNOT BE FOUND!\n"); System.out .println("\n----------------------------------------\n\n"); } else break; } } /** * 不區分大小寫比較兩個字符串 *? * @param s1 * @param s2 * @return */ public static int strsEqualsIgnoreCase(String s1, String s2) { int n1 = s1.length(), n2 = s2.length(); for (int i1 = 0, i2 = 0; i1 < n1 && i2 < n2; i1++, i2++) { char c1 = s1.charAt(i1); char c2 = s2.charAt(i2); if (c1 != c2) { // 源字符串全部都轉為大寫字符串 c1 = Character.toUpperCase(c1); c2 = Character.toUpperCase(c2); if (c1 != c2) { // 源字符串全部都轉為小寫字符串 c1 = Character.toLowerCase(c1); c2 = Character.toLowerCase(c2); if (c1 != c2) { return c1 - c2; } } } } return n1 - n2;// 如果其中一個或者兩個String都比較完了還沒有同樣的char的話,那就return兩個String的長度差距 } /** * 重寫了Inpustream 中的skip(long n) 方法,將數據流中起始的n 個字節跳過 * 參考:http://blog.csdn.net/ranxiedao/article/details/7787342 * @param inputStream * @param n * @return */ private static long skipBytesFromStream(InputStream inputStream, long n) { long remaining = n; // SKIP_BUFFER_SIZE is used to determine the size of // skipBuffer int SKIP_BUFFER_SIZE = 2048; // skipBuffer is initialized in // skip(long), if needed. byte[] skipBuffer = null; int nr = 0; if (skipBuffer == null) { skipBuffer = new byte[SKIP_BUFFER_SIZE]; } byte[] localSkipBuffer = skipBuffer; if (n <= 0) { return 0; } while (remaining > 0) { try { nr = inputStream.read(localSkipBuffer, 0, (int) Math.min( SKIP_BUFFER_SIZE, remaining)); } catch (IOException e) { e.printStackTrace(); } if (nr < 0) { break; } remaining -= nr; } return n - remaining; } /** * 主函數 * @param args */ public static void main(String args[]) { consult(); try { isidx.close(); isdict.close(); } catch (Exception e) { System.out.println("Close files error!"); e.printStackTrace(); } } } 如果要在windows平臺下編譯http://blog.chinaunix.net/uid-20454005-id-1675913.html文章中的程序代碼最好保存為cpp文件以C++項目編譯執行,而且strcasecmp函數應該換為stricmp函數,并且上面作者原來的程序是在linux平臺下的,字符編碼本身就是UTF8的不需要進行編碼轉換,但在windows平臺下中文為gb232編碼,就需要進行編碼的轉換,下面為需要添加修改上的字符編碼轉換后的程序。 //UTF-8到GB2312的轉換 char* U2G(const char* utf8) { int len = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, NULL, 0); wchar_t* wstr = new wchar_t[len+1]; memset(wstr, 0, len+1); MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wstr, len); len = WideCharToMultiByte(CP_ACP, 0, wstr, -1, NULL, 0, NULL, NULL); char* str = new char[len+1]; memset(str, 0, len+1); WideCharToMultiByte(CP_ACP, 0, wstr, -1, str, len, NULL, NULL); if(wstr) delete[] wstr; return str; } //GB2312到UTF-8的轉換 char* G2U(const char* gb2312) { int len = MultiByteToWideChar(CP_ACP, 0, gb2312, -1, NULL, 0); wchar_t* wstr = new wchar_t[len+1]; memset(wstr, 0, len+1); MultiByteToWideChar(CP_ACP, 0, gb2312, -1, wstr, len); len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL); char* str = new char[len+1]; memset(str, 0, len+1); WideCharToMultiByte(CP_UTF8, 0, wstr, -1, str, len, NULL, NULL); if(wstr) delete[] wstr; return str; } /* * 顯示data內容 */ void display_data(char *data_buf, unsigned int data_length) { fwrite(data_buf,data_length,1,stdout); char *data=(char *)malloc(data_length); memcpy(data,data_buf,data_length); char *p=U2G(data_buf); printf("%s\n",p); free(data); delete p; } 以星際譯王所支持的牛津英漢詞典oxford-gb作為測試,詞典格式為UTF8編碼的單詞字符串,然后是四個字節的int型數據表示該單詞在dict釋義文件中的起始偏移量,再后四個字節的int型數據表示dict文件中該單詞釋義總共的長度,如下圖所示:
結果顯示能夠正確得到單詞的釋義,只是音標未能正確解碼,如下圖所示:
轉載于:https://www.cnblogs.com/u0mo5/p/3980508.html
總結
以上是生活随笔為你收集整理的JAVA版StarDict星际译王简单实现的全部內容,希望文章能夠幫你解決所遇到的問題。
如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。