一个用于读unicode文本的迭代器(iterator)
written by chenghuige at gmail.com
也許有更好的方法我沒有想到,但是我沒有在linux下面找到比較方便的讀取unicode文本的方法。
用ICU,QT都太重量級了,于是自己寫了一個包裝好的unicode_iterator,當然還可以進一步
包裝比如提出一個類提供begin和end.但是還要考慮很多,比如有的是little endian格式的有的是big endian
格式的,同時有的文本可能并沒有標準的開頭表明它是unicode格式的,以及是little endian 還是big endian,
需要用戶自己指出。
當前初始化給定一個std::filebuf,然后每次*iter提取的是一個UTF16格式的字符,定義
typedef unsigned short UTF16
注意沒有用wchar_t因為這個在windows下是2byte但是在linux下GCC默認是4byte,為UTF32準備的。
迭代器用boost iterator幫助簡化書寫,類似的我們也可以寫出給定一個utf8或者GBK等等編碼的流轉換
到unicode(UTF16)的iterator,內部統一用unicode處理數據還是比較方便的,C++0X也出現了u16string.
?
用法示意:假設是一個標準的little endian 的 unicode文本
?
代碼 ?1??2???using?namespace?std;
?3???using?namespace?glseg;
?4?
?5???ifstream?istr(infilename.c_str());
?6???filebuf*?pbuf?=?istr.rdbuf();
?7???
?8???unsigned?char?ch??=?pbuf->sbumpc();
?9???unsigned?char?ch1?=?pbuf->sbumpc();
10?
11???if?(ch?==?encoding_type[Utf16LittleEndian][0]?&&?ch1?==?encoding_type[Utf16LittleEndian][1])?
12?????cout?<<?"The?encoding?of?this?file?is??utf16?little?endian"?<<?endl;
13???if?(ch?==?encoding_type[Utf16BigEndian][0]?&&?ch1?==?encoding_type[Utf16BigEndian][1])
14?????cout?<<?"The?encoding?of?this?file?is??utf16?big?endian"?<<?endl;
15?
16???unicode_iterator<>?first(pbuf);
17???unicode_iterator<>?end;
18???
19???UTF8?utf8_array[4];
20???for?(;first?!=?end;?++first)?{
21?????unicode2utf8(*first,?utf8_array);?//將utf16字符轉換到utf8,這樣用cout就可以顯示了因為默認都是utf8
22?????cout?<<?utf8_array;
23???}
?
?
?
代碼
/**
 *  ==============================================================================
 *
 *          \file   type.h
 *
 *        \author   chenghuige at gmail.com
 *
 *          \date   2009-12-09 19:02:16.223408
 *
 *   Description:   different type declarations
 *  ==============================================================================
 */
13?
#ifndef TYPE_H_
#define TYPE_H_

namespace glseg {

// Fixed-purpose integer aliases used for encoded text.
typedef unsigned long   UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

// Text encodings that can be recognised from a byte-order mark (BOM).
// The enumerator values double as row indices into encoding_type below;
// encoding_num is the number of rows and must stay last.
enum EncodingType {
    Unknown,
    Utf16LittleEndian,  // Default on Windows
    Utf16BigEndian,
    Utf8,
    encoding_num
};

// BOM bytes for each EncodingType, indexed by the enumerator.
// Every row holds three bytes; the UTF-16 marks are only two bytes long, so
// their third byte is 0x00 padding and must not be compared against file data.
const UTF8 encoding_type[encoding_num][3] =
{
    {0x00, 0x00, 0x00},  // Unknown
    {0xFF, 0xFE, 0x00},  // Little endian
    {0xFE, 0xFF, 0x00},  // Big endian
    {0xEF, 0xBB, 0xBF},  // UTF8
};

}  //----end of namespace glseg

#endif  //----end of TYPE_H_
?
?
?
代碼
/**
 *  ==============================================================================
 *
 *          \file   unicode_iterator.h
 *
 *        \author   chenghuige at gmail.com
 *
 *          \date   2009-12-09 16:56:59.395999
 *
 *   Description:   A Unicode iterator for Unicode-encoded files
 *  ==============================================================================
 */
?13?
?14?#ifndef?UNICODE_ITERATOR_H_
?15?#define?UNICODE_ITERATOR_H_
?16?
?17?#include?<boost/iterator/iterator_facade.hpp>
?18?#include?<fstream>
?19?#include?"type.h"?
?20?
?21?namespace?glseg?{
?22?
?23?//this?is?mainly?for?file?because?when?we?read?the?file
?24?//we?can?not?read?two?bytes?and?decide?the?little?or
?25?//big?edian?easy,?this?class?will?help
?26?//Perhaps?name?it?as?unicode?or?utf16?fstream?buf?itreator?is?better:)
?27?
?28?//the?big?endian?case
?29?template?<bool?isLittleEndian>
?30?struct?UTF8_2_UTF16?{
?31???static?void?convert(const?UTF8?ch1,?const?UTF8?ch2,?UTF16*?result)?{
?32?????*result?=?ch1;???
?33?????*result?<<=?8;
?34?????*result?|=?ch2;
?35???}
?36?};
?37?
?38?//the?little?endian?case
?39?template?<>
?40?struct?UTF8_2_UTF16<true>?{
?41???static?void?convert(const?UTF8?ch1,?const?UTF8?ch2,?UTF16*?result)?{
?42?????*result?=?ch2;???
?43?????*result?<<=?8;
?44?????*result?|=?ch1;
?45???}
?46?};
?47?
?48?
?49?template<bool?isLittleEndian?=?true>
?50?class?unicode_iterator
?51???:?public?boost::iterator_facade<
?52???????unicode_iterator<isLittleEndian>
?53?????,?UTF16
?54?????,?boost::forward_traversal_tag
?55?????,?UTF16&
?56?????>
?57?{
?58?public:
?59???unicode_iterator()
?60?????:?pbuf_(0),?valid_(false)?{}
?61?
?62???explicit?unicode_iterator(std::filebuf*?pbuf)?
?63?????:pbuf_(pbuf),?valid_(true)?{?increment();?}?//need?to?be?ready?for?first?*iter
?64?
?65?private:
?66???friend?class?boost::iterator_core_access;
?67?
?68???void?increment()?{?
?69?????if?(pbuf_->sgetc()!=EOF)?{
?70???????ch1_?=?pbuf_->sbumpc();
?71???????ch2_?=?pbuf_->sbumpc();
?72???????UTF8_2_UTF16<isLittleEndian>::convert(ch1_,?ch2_,?&result_);
?73?????}?
?74?????else?{
?75???????valid_?=?false;
?76?????}
?77???}
?78?
?79???bool?equal(unicode_iterator<isLittleEndian>?const&?other)?const?{
?80?????return?(other.valid_?&&?valid_)
?81?????????????(other.pbuf_?==?pbuf_)
?82???????????:(other.valid_?==?valid_);
?83???}
?84?
?85???UTF16&?dereference()?const?{?
?86?????return?result_;
?87???}
?88?
?89?private:
?90???std::filebuf*?pbuf_;
?91???bool?valid_;
?92?
?93???UTF8??ch1_;
?94???UTF8??ch2_;
?95???mutable?UTF16?result_;
?96???//FIXME?if?not?mutable
?97???//for?86?error:?invalid?initialization?of?reference?oftype?'glseg::UTF16&'?from?expression?of?type?'const?glseg::UTF16'
?98?};
?99?
100?}??//----end?of?namespace?glseg
101?
102?#endif??//----end?of?UNICODE_ITERATOR_H_
?
?
轉載于:https://www.cnblogs.com/rocketfan/archive/2009/12/09/1620604.html
與50位技術專家面對面20年技術見證,附贈技術全景圖總結
以上是生活随笔為你收集整理的一个用于读unicode文本的迭代器(iterator)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 公户的钱怎样合理转出不交税
- 下一篇: 找寻消失的网络连接