精华内容
下载资源
问答
  • utf8 to utf16

    2017-02-22 07:09:40
    1. 问题描述: 将一个utf8 类型的字符串转换为utf16.../** utf8_to_utf16.c */ #define UTF8_END -1 #define UTF8_ERROR -2 typedef struct json_utf8_decode { int the_index; char *the_input; int the_length;

    1.  问题描述: 将一个utf8 类型的字符串转换为utf16 

    /** utf8_to_utf16.c */

    #include <stdio.h>
    
    #define UTF8_END   -1
    #define UTF8_ERROR -2
    
    /** Decoder state for one UTF-8 input buffer; set up by utf8_decode_init(). */
    typedef struct json_utf8_decode
    {
        /** offset of the next byte that will be read from the_input */
        int the_index;
        /** the input buffer being decoded */
        char *the_input;
        /** number of bytes in the_input */
        int the_length;
        /** incremented each time utf8_decode_next() starts reading a character */
        int the_char;
        /** byte offset at which the most recently read character started */
        int the_byte;
    } json_utf8_decode;
    
    extern int  utf8_decode_at_byte(json_utf8_decode *utf8);
    extern int  utf8_decode_at_character(json_utf8_decode *utf8);
    extern void utf8_decode_init(json_utf8_decode *utf8, char p[], int length);
    extern int  utf8_decode_next(json_utf8_decode *utf8);
    
    // utf8_to_utf16
    extern int utf8_to_utf16(unsigned short w[], char p[], int length);
    
    /**
        Very Strict UTF-8 Decoder
    
        UTF-8 is a multibyte character encoding of Unicode. A character can be
        represented by 1-4 bytes. The bit pattern of the first byte indicates the
        number of continuation bytes.
    
        Most UTF-8 decoders tend to be lenient, attempting to recover as much
        information as possible, even from badly encoded input. This UTF-8
        decoder is not lenient. It will reject input which does not include
        proper continuation bytes. It will reject aliases (or suboptimal
        codings). It will reject surrogates. (Surrogate encoding should only be
        used with UTF-16.)
    
        Code     Continuation Minimum Maximum
        0xxxxxxx            0       0     127
        10xxxxxx        error
        110xxxxx            1     128    2047
        1110xxxx            2    2048   65535 excluding 55296 - 57343
        11110xxx            3   65536 1114111
        11111xxx        error
    */
    
    
    /**
        Fetch one raw input byte and advance the read position.
        Returns the byte as a value in 0..255, or UTF8_END (without
        advancing) when the read position is at or past the end.
    */
    static int
    get(json_utf8_decode *utf8)
    {
        if (utf8->the_index < utf8->the_length) {
            int at = utf8->the_index++;
            /** mask so a signed char never yields a negative value */
            return utf8->the_input[at] & 0xFF;
        }
        return UTF8_END;
    }
    
    
    /**
        Read the next byte and return its low 6 bits if it is a
        continuation byte (10xxxxxx). Anything else, including the end
        of the input, yields UTF8_ERROR.
    */
    static int
    cont(json_utf8_decode *utf8)
    {
        int byte = get(utf8);
        /** UTF8_END is -1, whose top two bits are 11, so it fails this test too */
        if ((byte & 0xC0) != 0x80) {
            return UTF8_ERROR;
        }
        return byte & 0x3F;
    }
    
    
    /**
        Prepare a decoder for the buffer p of the given length.
        Every piece of decoder state is stored in *utf8; the buffer is
        referenced, not copied.
    */
    void
    utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
    {
        /** describe the input */
        utf8->the_input = p;
        utf8->the_length = length;
        /** rewind the read position and both reporting counters */
        utf8->the_byte = 0;
        utf8->the_char = 0;
        utf8->the_index = 0;
    }
    
    
    /**
        Report the byte offset at which the most recently read character
        started. Intended for error messages.
    */
    int
    utf8_decode_at_byte(json_utf8_decode *utf8)
    {
        int start_offset = utf8->the_byte;
        return start_offset;
    }
    
    
    /**
        Report the zero-based index of the most recently read character,
        or 0 if no character has been read yet. Intended for error
        messages; for pure ASCII input it equals the byte offset.
    */
    int
    utf8_decode_at_character(json_utf8_decode *utf8)
    {
        if (utf8->the_char <= 0) {
            return 0;
        }
        return utf8->the_char - 1;
    }
    
    
    /**
        Decode the next character from the input.
        Returns: the code point (0 to 0x10FFFF, never a surrogate)
             or  UTF8_END   when the read position is exactly at the end
             or  UTF8_ERROR for a stray or missing continuation byte, an
                            overlong form, an encoded surrogate, a value
                            above 0x10FFFF, an invalid lead byte, or a
                            read position past the end.
        Once past the end check, the call records the starting byte
        offset and bumps the character counter even if the character
        then turns out to be malformed.
    */
    int
    utf8_decode_next(json_utf8_decode *utf8)
    {
        int lead;        /** first byte of the sequence */
        int b1, b2, b3;  /** payloads of the continuation bytes */
        int code;        /** assembled code point */

        if (utf8->the_index == utf8->the_length) {
            return UTF8_END;
        }
        if (utf8->the_index > utf8->the_length) {
            return UTF8_ERROR;
        }
        utf8->the_char += 1;
        utf8->the_byte = utf8->the_index;
        lead = get(utf8);

        /** the high nibble of the lead byte selects the sequence length */
        switch (lead >> 4) {
        case 0x0:
        case 0x1:
        case 0x2:
        case 0x3:
        case 0x4:
        case 0x5:
        case 0x6:
        case 0x7:
            /** 0xxxxxxx: plain ASCII */
            return lead;

        case 0xC:
        case 0xD:
            /** 110xxxxx: one continuation byte, 0x80 to 0x7FF */
            b1 = cont(utf8);
            if (b1 < 0) {
                return UTF8_ERROR;
            }
            code = ((lead & 0x1F) << 6) | b1;
            return code < 0x80 ? UTF8_ERROR : code;

        case 0xE:
            /** 1110xxxx: two continuation bytes, 0x800 to 0xFFFF minus surrogates.
                Both bytes are read before either is checked. */
            b1 = cont(utf8);
            b2 = cont(utf8);
            if (b1 < 0 || b2 < 0) {
                return UTF8_ERROR;
            }
            code = ((lead & 0x0F) << 12) | (b1 << 6) | b2;
            if (code < 0x800) {
                return UTF8_ERROR;
            }
            if (code >= 0xD800 && code <= 0xDFFF) {
                return UTF8_ERROR;
            }
            return code;

        case 0xF:
            /** 11111xxx is never valid; nothing further is read */
            if ((lead & 0x08) != 0) {
                return UTF8_ERROR;
            }
            /** 11110xxx: three continuation bytes, 0x10000 to 0x10FFFF.
                All three bytes are read before any is checked. */
            b1 = cont(utf8);
            b2 = cont(utf8);
            b3 = cont(utf8);
            if (b1 < 0 || b2 < 0 || b3 < 0) {
                return UTF8_ERROR;
            }
            /** bit 3 is known to be clear here, so masking with 0x07 keeps every payload bit */
            code = ((lead & 0x07) << 18) | (b1 << 12) | (b2 << 6) | b3;
            if (code < 0x10000 || code > 0x10FFFF) {
                return UTF8_ERROR;
            }
            return code;

        default:
            /** 10xxxxxx: a continuation byte where a lead byte was expected */
            return UTF8_ERROR;
        }
    }
    
    /**
        Convert UTF-8 bytes to UTF-16 code units.

        w       output array, or a null pointer to only count the units
                that would be produced. When non-null the caller must
                provide room for every unit written; `length` units is
                always enough, because a sequence of 1 to 3 bytes
                produces one unit and a 4-byte sequence produces two.
                No terminator is appended.
        p       the UTF-8 input; it does not need to be NUL-terminated
        length  number of bytes of p to convert

        Returns the number of UTF-16 units produced, or UTF8_ERROR if
        the input is rejected by utf8_decode_next() or if p is null
        while length is positive. On error, units converted before the
        bad sequence may already have been written to w.
    */
    int
    utf8_to_utf16(unsigned short w[], char p[], int length)
    {
        int c;
        int the_index = 0;
        json_utf8_decode utf8;

        /** a null buffer is only acceptable when there is nothing to read */
        if (!p && length > 0) {
            return UTF8_ERROR;
        }

        utf8_decode_init(&utf8, p, length);
        for (;;) {
            c = utf8_decode_next(&utf8);
            if (c < 0) {
                return (c == UTF8_END) ? the_index : UTF8_ERROR;
            }
            if (c < 0x10000) {
                /** one unit for anything in the Basic Multilingual Plane */
                if (w) {
                    w[the_index] = (unsigned short)c;
                }
                the_index += 1;
            } else {
                /** split the 20 bits above 0x10000 into a high/low surrogate pair */
                c -= 0x10000;
                if (w) {
                    w[the_index] = (unsigned short)(0xD800 | (c >> 10));
                    w[the_index + 1] = (unsigned short)(0xDC00 | (c & 0x3FF));
                }
                the_index += 2;
            }
        }
    }
    
    /**
        Demo: convert an ASCII path to UTF-16, then print the number of
        units produced followed by each unit as a character.
        Exits with 1 if the conversion reports an error, 0 otherwise.
    */
    int main(void)
    {
        char            in_str[] = "/home/steven";
        /** sizeof counts the terminating NUL; it is not part of the text to convert */
        int             length = (int)(sizeof in_str - 1);
        /** one UTF-16 unit per input byte is always sufficient */
        unsigned short  out_str[sizeof in_str] = {0};
        int             i, ret;

        ret = utf8_to_utf16(out_str, in_str, length);
        printf("%d ", ret);
        /** print only what was produced; the loop does not run when ret is an error code */
        for (i = 0; i < ret; i++)
        {
            printf("%c ", out_str[i]);
        }

        printf("\n");

        return ret < 0 ? 1 : 0;
    }
    


    展开全文
  • 本文整理匯總了Java中org.apache.lucene.util.UnicodeUtil.UTF8toUTF16方法的典型用法代碼示例。如果您正苦於以下問題:Java UnicodeUtil.UTF8toUTF16方法的具體用法?Java UnicodeUtil.UTF8toUTF16怎麽用?Java ...

    本文整理匯總了Java中org.apache.lucene.util.UnicodeUtil.UTF8toUTF16方法的典型用法代碼示例。如果您正苦於以下問題:Java UnicodeUtil.UTF8toUTF16方法的具體用法?Java UnicodeUtil.UTF8toUTF16怎麽用?Java UnicodeUtil.UTF8toUTF16使用的例子?那麽恭喜您, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.lucene.util.UnicodeUtil的用法示例。

    在下文中一共展示了UnicodeUtil.UTF8toUTF16方法的26個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於我們的係統推薦出更棒的Java代碼示例。

    示例1: build

    ​點讚 3

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
     * strings in UTF-8. These strings must be binary-sorted.
     *
     * @param input UTF-8 terms in binary sort order
     * @return the automaton produced by {@code Automaton.Builder.finish()}
     */
    public static Automaton build(Collection<BytesRef> input) {
        final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

        // One scratch buffer is reused for every term and grown on demand.
        char[] chars = new char[0];
        CharsRef ref = new CharsRef();
        for (BytesRef b : input) {
            // b.length chars always suffice: UTF-16 never needs more units than UTF-8 needs bytes.
            chars = ArrayUtil.grow(chars, b.length);
            final int len = UnicodeUtil.UTF8toUTF16(b, chars);
            ref.chars = chars;
            ref.length = len;
            builder.add(ref);
        }

        Automaton.Builder a = new Automaton.Builder();
        convert(a, builder.complete(), new IdentityHashMap());
        return a.finish();
    }

    開發者ID:lamsfoundation,項目名稱:lams,代碼行數:25,

    示例2: evaluate

    ​點讚 3

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Lower-cases the first argument's value.
     *
     * @return the lower-cased value as UTF-8, or {@code null} if the value is {@code null}
     */
    @Override
    public BytesRef evaluate(Input... args) {
        final Object raw = args[0].value();
        if (raw == null) {
            return null;
        }

        final BytesRef utf8 = BytesRefs.toBytesRef(raw);
        // UTF-16 never needs more code units than the UTF-8 form has bytes.
        final char[] utf16 = new char[utf8.length];
        final int utf16Len = UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, utf16);

        charUtils.toLowerCase(utf16, 0, utf16Len);

        final byte[] encoded = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * utf16Len];
        final int encodedLen = UnicodeUtil.UTF16toUTF8(utf16, 0, utf16Len, encoded);
        return new BytesRef(encoded, 0, encodedLen);
    }

    開發者ID:baidu,項目名稱:Elasticsearch,代碼行數:18,

    示例3: evaluate

    ​點讚 3

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Upper-cases the first argument's value.
     *
     * @return the upper-cased value as UTF-8, or {@code null} if the value is {@code null}
     */
    @Override
    public BytesRef evaluate(Input... args) {
        final Object raw = args[0].value();
        if (raw == null) {
            return null;
        }

        final BytesRef utf8 = BytesRefs.toBytesRef(raw);
        // UTF-16 never needs more code units than the UTF-8 form has bytes.
        final char[] utf16 = new char[utf8.length];
        final int utf16Len = UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, utf16);

        charUtils.toUpperCase(utf16, 0, utf16Len);

        final byte[] encoded = new byte[UnicodeUtil.MAX_UTF8_BYTES_PER_CHAR * utf16Len];
        final int encodedLen = UnicodeUtil.UTF16toUTF8(utf16, 0, utf16Len, encoded);
        return new BytesRef(encoded, 0, encodedLen);
    }

    開發者ID:baidu,項目名稱:Elasticsearch,代碼行數:18,

    示例4: addTermFrequencies

    ​點讚 3

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Adds terms and frequencies found in vector into the Map termFreqMap.
     * Terms for which {@code isNoiseWord} returns true are skipped.
     *
     * @param termFreqMap a Map of terms and their frequencies; updated in place
     * @param vector List of terms and their frequencies for a doc/field
     */
    private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
        final TermsEnum termsEnum = vector.iterator(null);
        final CharsRef spare = new CharsRef();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            UnicodeUtil.UTF8toUTF16(text, spare);
            final String term = spare.toString();
            if (isNoiseWord(term)) {
                continue;
            }
            final int freq = (int) termsEnum.totalTermFreq();

            // increment frequency
            Int cnt = termFreqMap.get(term);
            if (cnt == null) {
                cnt = new Int();
                termFreqMap.put(term, cnt);
                cnt.x = freq;
            } else {
                cnt.x += freq;
            }
        }
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:30,

    示例5: build

    ​點讚 3

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Rebuilds the ternary tree from every entry of the iterator.
     * All terms and weights are buffered first, then handed to
     * {@code autocomplete.balancedTree} in one call.
     */
    @Override
    public void build(TermFreqIterator tfit) throws IOException {
        root = new TernaryTreeNode();

        // The entries must arrive in UTF-16 sort order; wrap the iterator if they do not.
        if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
            tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
        }

        ArrayList terms = new ArrayList();
        ArrayList weights = new ArrayList();
        CharsRef utf16 = new CharsRef();
        for (BytesRef utf8 = tfit.next(); utf8 != null; utf8 = tfit.next()) {
            utf16.grow(utf8.length);
            UnicodeUtil.UTF8toUTF16(utf8.bytes, utf8.offset, utf8.length, utf16);
            terms.add(utf16.toString());
            weights.add(Long.valueOf(tfit.weight()));
        }

        autocomplete.balancedTree(terms.toArray(), weights.toArray(), 0, terms.size() - 1, root);
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:22,

    示例6: build

    ​點讚 3

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
     * strings in UTF-8. These strings must be binary-sorted.
     *
     * @param input UTF-8 terms in binary sort order
     * @return a new automaton whose {@code deterministic} flag is set
     */
    public static Automaton build(Collection<BytesRef> input) {
        final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

        // One scratch CharsRef is reused for every term.
        CharsRef scratch = new CharsRef();
        for (BytesRef b : input) {
            UnicodeUtil.UTF8toUTF16(b, scratch);
            builder.add(scratch);
        }

        Automaton a = new Automaton();
        a.initial = convert(builder.complete(), new IdentityHashMap());
        a.deterministic = true;
        return a;
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:21,

    示例7: decompressString

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decompress the byte array previously returned by compressString back into a String.
     *
     * @param value  buffer holding the compressed data
     * @param offset start of the compressed data within {@code value}
     * @param length number of compressed bytes
     */
    public static String decompressString(byte[] value, int offset, int length) throws DataFormatException {
        final byte[] utf8 = decompress(value, offset, length);
        // One char per UTF-8 byte is always enough room for the UTF-16 form.
        final char[] utf16 = new char[utf8.length];
        return new String(utf16, 0, UnicodeUtil.UTF8toUTF16(utf8, 0, utf8.length, utf16));
    }

    開發者ID:lamsfoundation,項目名稱:lams,代碼行數:9,

    示例8: codePoint

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Returns the value obtained by decoding the single byte at {@code index} as UTF-8.
     */
    @Override
    public int codePoint(int index) {
        //FIXME: is this the correct behaviour?
        // NOTE(review): only one byte is handed to the decoder, so a multi-byte UTF-8
        // sequence is never decoded as a whole here - confirm what callers expect.
        tmpByte[0] = contents.bytes[index];
        UnicodeUtil.UTF8toUTF16(tmpByte, 0, 1, tmpChar);
        return tmpChar[0] & 0xFFFF;
    }

    開發者ID:s4ke,項目名稱:moar,代碼行數:8,

    示例9: marshalStringSortValue

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Marshals a string-based field value.
     *
     * @param value a {@link BytesRef} holding UTF-8, or {@code null}
     * @return the decoded String, or {@code null} for a {@code null} value
     */
    protected static Object marshalStringSortValue(Object value) {
        if (value == null) {
            return null;
        }
        final CharsRef utf16 = new CharsRef();
        UnicodeUtil.UTF8toUTF16((BytesRef) value, utf16);
        return utf16.toString();
    }

    開發者ID:europeana,項目名稱:search,代碼行數:12,

    示例10: marshalSortValue

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the UTF-8 sort value and passes the resulting string to
     * {@code NumberUtils.SortableStr2int}; a {@code null} value is returned as {@code null}.
     */
    @Override
    public Object marshalSortValue(Object value) {
        if (value == null) {
            return null;
        }
        final CharsRef utf16 = new CharsRef();
        UnicodeUtil.UTF8toUTF16((BytesRef) value, utf16);
        final String sortable = utf16.toString();
        return NumberUtils.SortableStr2int(sortable);
    }

    開發者ID:europeana,項目名稱:search,代碼行數:10,

    示例11: marshalSortValue

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the UTF-8 sort value and passes the resulting string to
     * {@code NumberUtils.SortableStr2long}; a {@code null} value is returned as {@code null}.
     */
    @Override
    public Object marshalSortValue(Object value) {
        if (value == null) {
            return null;
        }
        final CharsRef utf16 = new CharsRef();
        UnicodeUtil.UTF8toUTF16((BytesRef) value, utf16);
        final String sortable = utf16.toString();
        return NumberUtils.SortableStr2long(sortable);
    }

    開發者ID:europeana,項目名稱:search,代碼行數:10,

    示例12: marshalSortValue

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the UTF-8 sort value and passes the resulting string to
     * {@code NumberUtils.SortableStr2float}; a {@code null} value is returned as {@code null}.
     */
    @Override
    public Object marshalSortValue(Object value) {
        if (value == null) {
            return null;
        }
        final CharsRef utf16 = new CharsRef();
        UnicodeUtil.UTF8toUTF16((BytesRef) value, utf16);
        final String sortable = utf16.toString();
        return NumberUtils.SortableStr2float(sortable);
    }

    開發者ID:europeana,項目名稱:search,代碼行數:10,

    示例13: marshalSortValue

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the UTF-8 sort value and passes the resulting string to
     * {@code NumberUtils.SortableStr2double}; a {@code null} value is returned as {@code null}.
     */
    @Override
    public Object marshalSortValue(Object value) {
        if (value == null) {
            return null;
        }
        final CharsRef utf16 = new CharsRef();
        UnicodeUtil.UTF8toUTF16((BytesRef) value, utf16);
        final String sortable = utf16.toString();
        return NumberUtils.SortableStr2double(sortable);
    }

    開發者ID:europeana,項目名稱:search,代碼行數:10,

    示例14: serializeSearchGroup

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    private NamedList serializeSearchGroup(Collection> data, Sort groupSort) {

    NamedList result = new NamedList();

    CharsRef spare = new CharsRef();

    for (SearchGroup searchGroup : data) {

    Comparable[] convertedSortValues = new Comparable[searchGroup.sortValues.length];

    for (int i = 0; i < searchGroup.sortValues.length; i++) {

    Comparable sortValue = (Comparable) searchGroup.sortValues[i];

    SchemaField field = groupSort.getSort()[i].getField() != null ? searcher.getSchema().getFieldOrNull(groupSort.getSort()[i].getField()) : null;

    if (field != null) {

    FieldType fieldType = field.getType();

    if (sortValue instanceof BytesRef) {

    UnicodeUtil.UTF8toUTF16((BytesRef)sortValue, spare);

    String indexedValue = spare.toString();

    sortValue = (Comparable) fieldType.toObject(field.createField(fieldType.indexedToReadable(indexedValue), 1.0f));

    } else if (sortValue instanceof String) {

    sortValue = (Comparable) fieldType.toObject(field.createField(fieldType.indexedToReadable((String) sortValue), 1.0f));

    }

    }

    convertedSortValues[i] = sortValue;

    }

    String groupValue = searchGroup.groupValue != null ? searchGroup.groupValue.utf8ToString() : null;

    result.add(groupValue, convertedSortValues);

    }

    return result;

    }

    開發者ID:netboynb,項目名稱:search-core,代碼行數:28,

    示例15: collect

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Offers a term and its count to the bounded queue of top counts.
     *
     * @return always {@code false}
     */
    @Override
    public boolean collect(BytesRef term, int count) {
        // NOTE: the test is count > min rather than count >= min as an optimization: terms
        // arrive in index order, so the keys are already ordered. This can matter a lot
        // when many counts are repeated (as zero counts would be).
        if (count <= min) {
            return false;
        }

        UnicodeUtil.UTF8toUTF16(term, spare);
        queue.add(new SimpleFacets.CountPair(spare.toString(), count));
        if (queue.size() >= maxsize) {
            min = queue.last().val;
        }
        return false;
    }

    開發者ID:yintaoxue,項目名稱:read-open-source-code,代碼行數:13,

    示例16: strVal

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Returns the term for {@code doc} as a String, or {@code null} when the
     * term fetched from {@code termsIndex} is empty.
     */
    @Override
    public String strVal(int doc) {
        termsIndex.get(doc, spare);
        if (spare.length != 0) {
            UnicodeUtil.UTF8toUTF16(spare, spareChars);
            return spareChars.toString();
        }
        return null;
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:10,

    示例17: testAllUnicodeChars

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    public void testAllUnicodeChars() throws Throwable {

    BytesRef utf8 = new BytesRef(10);

    CharsRef utf16 = new CharsRef(10);

    char[] chars = new char[2];

    for(int ch=0;ch<0x0010FFFF;ch++) {

    if (ch == 0xd800)

    // Skip invalid code points

    ch = 0xe000;

    int len = 0;

    if (ch <= 0xffff) {

    chars[len++] = (char) ch;

    } else {

    chars[len++] = (char) (((ch-0x0010000) >> 10) + UnicodeUtil.UNI_SUR_HIGH_START);

    chars[len++] = (char) (((ch-0x0010000) & 0x3FFL) + UnicodeUtil.UNI_SUR_LOW_START);

    }

    UnicodeUtil.UTF16toUTF8(chars, 0, len, utf8);

    String s1 = new String(chars, 0, len);

    String s2 = new String(utf8.bytes, 0, utf8.length, "UTF-8");

    assertEquals("codepoint " + ch, s1, s2);

    UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);

    assertEquals("codepoint " + ch, s1, new String(utf16.chars, 0, utf16.length));

    byte[] b = s1.getBytes("UTF-8");

    assertEquals(utf8.length, b.length);

    for(int j=0;j

    assertEquals(utf8.bytes[j], b[j]);

    }

    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:35,

    示例18: UTF8toUTF16

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes {@code bytes} into {@code charsRef}, replacing its char array with a
     * new one of {@code bytes.length} chars when the current array is shorter than that.
     */
    private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
        final char[] current = charsRef.chars;
        if (current.length < bytes.length) {
            // One char per UTF-8 byte is always enough room.
            charsRef.chars = new char[bytes.length];
        }
        final int decoded = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
        charsRef.length = decoded;
    }

    開發者ID:baidu,項目名稱:Elasticsearch,代碼行數:7,

    示例19: indexedToReadable

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the indexed UTF-8 term into {@code charsRef}, then appends the first
     * character of {@code Z_ARRAY}.
     *
     * @return the same {@code charsRef} instance that was passed in
     */
    @Override
    public CharsRef indexedToReadable(BytesRef input, CharsRef charsRef) {
        UnicodeUtil.UTF8toUTF16(input, charsRef);
        charsRef.append(Z_ARRAY, 0, 1);
        return charsRef;
    }

    開發者ID:europeana,項目名稱:search,代碼行數:7,

    示例20: indexedToReadable

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Given an indexed term, writes its human readable representation into {@code output}.
     *
     * @param input  the indexed term as UTF-8
     * @param output receives the decoded characters
     * @return the same {@code output} instance that was passed in
     */
    public CharsRef indexedToReadable(BytesRef input, CharsRef output) {
        UnicodeUtil.UTF8toUTF16(input, output);
        return output;
    }

    開發者ID:europeana,項目名稱:search,代碼行數:6,

    示例21: parseIntAt

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Parses the int written in {@code scratch} starting {@code offset} bytes into it
     * and running to its end.
     */
    private int parseIntAt(int offset) {
        final int start = scratch.offset + offset;
        final int count = scratch.length - offset;
        UnicodeUtil.UTF8toUTF16(scratch.bytes, start, count, scratchUTF16);
        return ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:5,

    示例22: parseIntAt

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Parses the int written in {@code bytes} starting {@code offset} bytes into it
     * and running to its end, using {@code scratch} as the UTF-16 work buffer.
     */
    private int parseIntAt(BytesRef bytes, int offset, CharsRef scratch) {
        final int start = bytes.offset + offset;
        final int count = bytes.length - offset;
        UnicodeUtil.UTF8toUTF16(bytes.bytes, start, count, scratch);
        return ArrayUtil.parseInt(scratch.chars, 0, scratch.length);
    }

    開發者ID:yintaoxue,項目名稱:read-open-source-code,代碼行數:5,

    示例23: readString

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the part of {@code scratch} that starts {@code offset} bytes into it
     * and runs to its end, and returns it as a String.
     */
    private String readString(int offset, BytesRef scratch) {
        final int start = scratch.offset + offset;
        final int count = scratch.length - offset;
        UnicodeUtil.UTF8toUTF16(scratch.bytes, start, count, scratchUTF16);
        return scratchUTF16.toString();
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:5,

    示例24: match

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Decodes the term into the {@code utf16} buffer and runs {@code regexp} from position 0.
     */
    @Override
    public boolean match(BytesRef term) {
        UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
        // NOTE(review): utf16wrapper presumably exposes the utf16 buffer filled above - confirm.
        final boolean matched = regexp.match(utf16wrapper, 0);
        return matched;
    }

    開發者ID:yintaoxue,項目名稱:read-open-source-code,代碼行數:6,

    示例25: accept

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Accepts the term when {@code runAutomaton} matches its UTF-16 form.
     *
     * @return {@code AcceptStatus.YES} on a match, {@code AcceptStatus.NO} otherwise
     */
    @Override
    protected AcceptStatus accept(BytesRef term) throws IOException {
        UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
        if (runAutomaton.run(utf16.chars, 0, utf16.length)) {
            return AcceptStatus.YES;
        }
        return AcceptStatus.NO;
    }

    開發者ID:pkarmstr,項目名稱:NYBC,代碼行數:7,

    示例26: serializeTopDocs

    ​點讚 2

    import org.apache.lucene.util.UnicodeUtil; //導入方法依賴的package包/類

    /**
     * Serializes a query command result into a {@code NamedList} with the entries
     * "matches", "totalHits", optionally "maxScore", and "documents".
     * Each document carries "id", optionally "score", and - for {@code FieldDoc}
     * hits only - "sortValues", where {@link BytesRef} and String values of fields
     * known to the schema are converted through the field type.
     */
    protected NamedList serializeTopDocs(QueryCommandResult result) throws IOException {
        NamedList response = new NamedList();
        response.add("matches", result.getMatches());
        response.add("totalHits", result.getTopDocs().totalHits);
        if (rb.getGroupingSpec().isNeedScore()) {
            response.add("maxScore", result.getTopDocs().getMaxScore());
        }

        List docList = new ArrayList();
        response.add("documents", docList);

        SchemaField idField = rb.req.getSearcher().getSchema().getUniqueKeyField();
        CharsRef utf16 = new CharsRef();
        for (ScoreDoc hit : result.getTopDocs().scoreDocs) {
            NamedList entry = new NamedList();
            docList.add(entry);

            Document stored = retrieveDocument(idField, hit.doc);
            entry.add("id", idField.getType().toExternal(stored.getField(idField.getName())));
            if (rb.getGroupingSpec().isNeedScore()) {
                entry.add("score", hit.score);
            }

            // Only FieldDoc hits carry sort values.
            if (!(hit instanceof FieldDoc)) {
                continue;
            }
            FieldDoc fieldHit = (FieldDoc) hit;

            Object[] converted = new Object[fieldHit.fields.length];
            for (int j = 0; j < fieldHit.fields.length; j++) {
                Object current = fieldHit.fields[j];
                Sort groupSort = rb.getGroupingSpec().getGroupSort();

                // A sort entry without a field name has no schema field to convert with.
                SchemaField sortField = null;
                if (groupSort.getSort()[j].getField() != null) {
                    sortField = rb.req.getSearcher().getSchema().getFieldOrNull(groupSort.getSort()[j].getField());
                }

                if (sortField != null) {
                    FieldType type = sortField.getType();
                    if (current instanceof BytesRef) {
                        UnicodeUtil.UTF8toUTF16((BytesRef) current, utf16);
                        String indexed = utf16.toString();
                        current = type.toObject(sortField.createField(type.indexedToReadable(indexed), 1.0f));
                    } else if (current instanceof String) {
                        current = type.toObject(sortField.createField(type.indexedToReadable((String) current), 1.0f));
                    }
                }
                converted[j] = current;
            }
            entry.add("sortValues", converted);
        }

        return response;
    }

    開發者ID:netboynb,項目名稱:search-core,代碼行數:49,

    注:本文中的org.apache.lucene.util.UnicodeUtil.UTF8toUTF16方法示例整理自Github/MSDocs等源碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。

    展开全文
  • error in utf16toutf8

    2020-12-08 19:22:56
    code.google.com/p/odbc.utf16toutf8(0x2008f0c0, 0x13, 0x100000, 0x50, 0x1, ...) C:/App/Go/src/code.google.com/p/odbc/utf16.go:49 +0x1f9 code.google.com/p/odbc.(*BaseColumn).Value(0x20089280, 0x...
  • Utf16ToUtf8

    千次阅读 2010-07-07 16:21:00
    摘自Qt源码: typedef unsigned char Uint8; typedef signed char Int8; typedef unsigned short Uint16; typedef signed short Int16; typedef signed int Int;...size_t utf16ToUtf8( char* dest, con

    摘自Qt源码:

     

    展开全文
  • UTF8转16进制工具 Utf8ToHex

    千次下载 热门讨论 2013-06-17 13:05:32
    UTF-8字符串转换为Latin1编码,比如中文“你好”转换为“\xE4\xBD\xA0\xE5\xA5\xBD”
  • function utf8to16(str) { var out, i, len, c; var char2, char3; out = ""; len = str.length; i = 0; while(i ) { c = str.charCodeAt(i++); switch(c >> ...


    <script type="text/javascript">
    // Base64 alphabet: the character at position v encodes the 6-bit value v.
    var base64EncodeChars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    // Reverse table with 128 entries, indexed by character code: the 6-bit value of
    // that character, or -1 if the character is not in the alphabet.
    // '+' (43) -> 62, '/' (47) -> 63, '0'-'9' -> 52-61, 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51.
    var base64DecodeChars = new Array(

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,

    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,

    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,

    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1);

    // Encoding routine: Base64-encodes a string whose characters are treated as bytes.
    // Input is consumed three characters at a time; a final group of one or two
    // characters is padded with "==" or "=" respectively.
    function base64encode(str) {
        var total = str.length;
        var pos = 0;
        var encoded = "";
        var b1, b2, b3;

        while (pos < total) {
            // Only the first byte is masked explicitly; the other two are only ever
            // used through bit masks that keep bits from their low byte.
            b1 = str.charCodeAt(pos++) & 0xff;
            if (pos == total) {
                // One byte left over: two sextets plus two pad characters.
                encoded += base64EncodeChars.charAt(b1 >> 2)
                    + base64EncodeChars.charAt((b1 & 0x3) << 4)
                    + "==";
                break;
            }

            b2 = str.charCodeAt(pos++);
            if (pos == total) {
                // Two bytes left over: three sextets plus one pad character.
                encoded += base64EncodeChars.charAt(b1 >> 2)
                    + base64EncodeChars.charAt(((b1 & 0x3) << 4) | ((b2 & 0xF0) >> 4))
                    + base64EncodeChars.charAt((b2 & 0xF) << 2)
                    + "=";
                break;
            }

            // Full group: three bytes become four sextets.
            b3 = str.charCodeAt(pos++);
            encoded += base64EncodeChars.charAt(b1 >> 2)
                + base64EncodeChars.charAt(((b1 & 0x3) << 4) | ((b2 & 0xF0) >> 4))
                + base64EncodeChars.charAt(((b2 & 0xF) << 2) | ((b3 & 0xC0) >> 6))
                + base64EncodeChars.charAt(b3 & 0x3F);
        }

        return encoded;
    }

    // Decoding routine: turns Base64 text back into a string of byte-valued characters.
    // Characters outside the Base64 alphabet are skipped. Decoding stops at the first
    // '=' found in the third or fourth position of a group, or when the input runs out.
    function base64decode(str) {
        var c1, c2, c3, c4;
        var i, len, out;

        // The lookup table has only 128 entries while codes are masked to 0..255, so
        // codes 128..255 would read undefined, slip past the "== -1" checks and be
        // decoded as zero bits. Report them as invalid (-1) instead.
        function sextet(code) {
            var value = base64DecodeChars[code];
            return value === undefined ? -1 : value;
        }

        len = str.length;
        i = 0;
        out = "";

        while (i < len) {
            // First sextet: skip invalid characters.
            do {
                c1 = sextet(str.charCodeAt(i++) & 0xff);
            } while (i < len && c1 == -1);
            if (c1 == -1)
                break;

            // Second sextet: completes the first output byte.
            do {
                c2 = sextet(str.charCodeAt(i++) & 0xff);
            } while (i < len && c2 == -1);
            if (c2 == -1)
                break;

            out += String.fromCharCode((c1 << 2) | ((c2 & 0x30) >> 4));

            // Third sextet: '=' (61) here means the data ended after one byte.
            do {
                c3 = str.charCodeAt(i++) & 0xff;
                if (c3 == 61)
                    return out;
                c3 = sextet(c3);
            } while (i < len && c3 == -1);
            if (c3 == -1)
                break;

            out += String.fromCharCode(((c2 & 0XF) << 4) | ((c3 & 0x3C) >> 2));

            // Fourth sextet: '=' (61) here means the data ended after two bytes.
            do {
                c4 = str.charCodeAt(i++) & 0xff;
                if (c4 == 61)
                    return out;
                c4 = sextet(c4);
            } while (i < len && c4 == -1);
            if (c4 == -1)
                break;

            out += String.fromCharCode(((c3 & 0x03) << 6) | c4);
        }

        return out;
    }

    // Encodes each UTF-16 code unit of str as UTF-8-style bytes, returned as a string
    // of byte-valued characters:
    //   0x0001-0x007F -> 1 byte
    //   0x0000 and 0x0080-0x07FF -> 2 bytes (so NUL becomes 0xC0 0x80)
    //   0x0800-0xFFFF -> 3 bytes
    // Code units are encoded one by one, so the two halves of a surrogate pair each
    // become their own 3-byte sequence.
    function utf16to8(str) {
        var pieces = [];
        for (var k = 0; k < str.length; k++) {
            var unit = str.charCodeAt(k);
            if (unit >= 0x0001 && unit <= 0x007F) {
                pieces.push(str.charAt(k));
            } else if (unit <= 0x07FF) {
                pieces.push(String.fromCharCode(
                    0xC0 | ((unit >> 6) & 0x1F),
                    0x80 | ((unit >> 0) & 0x3F)));
            } else {
                pieces.push(String.fromCharCode(
                    0xE0 | ((unit >> 12) & 0x0F),
                    0x80 | ((unit >> 6) & 0x3F),
                    0x80 | ((unit >> 0) & 0x3F)));
            }
        }
        return pieces.join("");
    }

    // Decodes a string of byte-valued characters holding UTF-8 into a UTF-16 string.
    // Handles 1-, 2-, 3- and 4-byte sequences; a 4-byte sequence becomes a surrogate
    // pair. Continuation bytes are not validated. Stray continuation bytes, the
    // invalid lead bytes 0xF8-0xFF and 4-byte values outside 0x10000-0x10FFFF
    // produce no output.
    function utf8to16(str) {
        var out, i, len, c;
        var char2, char3, char4, cp;

        out = "";
        len = str.length;
        i = 0;

        while (i < len) {
            c = str.charCodeAt(i++);
            // The high nibble of the lead byte selects the sequence length.
            switch (c >> 4) {
            case 0:
            case 1:
            case 2:
            case 3:
            case 4:
            case 5:
            case 6:
            case 7:
                // 0xxxxxxx
                out += str.charAt(i - 1);
                break;

            case 12:
            case 13:
                // 110x xxxx 10xx xxxx
                char2 = str.charCodeAt(i++);
                out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
                break;

            case 14:
                // 1110 xxxx 10xx xxxx 10xx xxxx
                char2 = str.charCodeAt(i++);
                char3 = str.charCodeAt(i++);
                out += String.fromCharCode(((c & 0x0F) << 12) |
                    ((char2 & 0x3F) << 6) |
                    ((char3 & 0x3F) << 0));
                break;

            case 15:
                // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
                // (lead bytes 0xF8-0xFF have bit 3 set and are not valid)
                if ((c & 0x08) === 0) {
                    char2 = str.charCodeAt(i++);
                    char3 = str.charCodeAt(i++);
                    char4 = str.charCodeAt(i++);
                    cp = ((c & 0x07) << 18) |
                        ((char2 & 0x3F) << 12) |
                        ((char3 & 0x3F) << 6) |
                        (char4 & 0x3F);
                    if (cp >= 0x10000 && cp <= 0x10FFFF) {
                        // Split the 20 bits above 0x10000 into a high/low surrogate pair.
                        cp -= 0x10000;
                        out += String.fromCharCode(0xD800 | (cp >> 10), 0xDC00 | (cp & 0x3FF));
                    }
                }
                break;
            }
        }

        return out;
    }

    console.log(base64encode("123"))
    console.log(base64decode("MTIz"))
    </script>

    转载于:https://www.cnblogs.com/mrt-yyy/p/9358632.html

    展开全文
  • Let's back up a bit…Java's text datatypes use the UTF-16 character encoding of the Unicode character set. (As do, VB4/5/6/A/Script, JavaScript, .NET, ….) You can see this in the various operations ...
  • Code points U+0000 to U+D7FF and U+E000 to U+FFFFThe first plane (code pointsU+0000 to U+FFFF) contains the most frequently used characters andis called theBasic Multilingual PlaneorBMP. Both UTF-1...
  • <div><p>Can https://github.com/masterzen/winrm/blob/1d17eaf15943ca3554cdebb3b1b10aaa543a0b7e/powershell.go#L10-L23 be changed to use a proper UTF-8 to UTF-16 (the native windows encoding) conversion?...
  • Add UTF8 to CharSet

    2021-01-11 14:43:25
    This means that on Windows we lack the ability to specify UTF8 as the marshaling, and more generally we lack the ability to specify UTF8 marshaling regardless of platform, making writing cross-...
  • Utf8ToHex.exe

    2018-02-09 11:20:23
    非常好用的工具,将utf8文本转换成16进制数据。非常好用的工具,将utf8文本转换成16进制数据。
  • <div><p>Fixes #151. <p>Please test this.</p><p>该提问来源于开源项目:SqliteModernCpp/sqlite_modern_cpp</p></div>
  • 最近爬数据下载到的google install数据总是丢一天 两天 三天。。。 想拿来看看啥情况,结构:more没东东,cat没东东,tail可以看,vim可以看 身为Linux小白的宝宝...test.data: Little-endian UTF-16 Unicode text
  • 各位朋友,下面这个utf16to8的js方法,怎样将结果转为gb2312啊? 这是用js生成二维码的一个中文支持的方法。(http://www.cnblogs.com/pfbk/p/4848875.html就是这个),但现在只有国标的设备,只支持gb2312,所以要...
  • utf8utf16转换

    千次阅读 2019-03-14 23:18:59
    1.UTF8UTF16编码转换 std::string ConvertFromUtf16ToUtf8(const std::wstring&amp; wstr) { std::string convertedString; int requiredSize = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, 0, 0, 0...
  • WIN32 - UTF16转换UTF8

    2020-11-30 12:15:01
    VS中的unicode为UTF-16,向文件写入内容,然后使用记事本打开时会乱码.记事本默认为UTF-8编码. ...BOOL UTF16ToUTF8(const TCHAR* WideChar,DWORD SizeOfWideChar, CustomerUTF8* customerUTF8) {...
  • <div><p>I think this issue is fixed in the version 3.0, but is happening to me right now, I love Exception Notification, can some one tell me what can i do?</p><p>该提问来源于开源项目:smartinez...
  • <p>There is today no convenient and reliable way to do UTF8 encoding, and at best we have an ambiguous definition of what Unicode is. <p>There are enough bits on the metadata tables to add these two ...
  • windows utf8utf16

    2016-02-29 18:21:00
    windows utf8 转 utf16 static int MdesUtf8ToUtf16 ( const char* src, wchar_t * dst ) { #ifdef _MSC_VER int size = MultiByteToWideChar ( CP_UTF8, 0, src, -1, dst, 0 ); return M...
  • function utf16ToUtf8(s){ if(!s){ return; } var i, code, ret = [], len = s.length; for(i = 0; i ; i++){ code = s.charCodeAt(i); if(code > 0x0 && code ){ //单字节 //UTF-16 0000 - ...
  • java.lang.NoSuchMethodError: org.apache.lucene.util.UnicodeUtil.UTF16toUTF8(Ljava/lang/CharSequence;IILorg/apache/lucene/util/BytesRef;)V at org.elasticsearch.common.Unicode.unsafeFromStringAsUtf8...
  • <div><p>While reading a website, I got the above ...Encoding::ConverterNotFoundError: code converter not found (UTF-8 to utf8) </code></pre>该提问来源于开源项目:sparklemotion/mechanize</p></div>
  • SQL>ALTER DATABASE character set INTERNAL_USE ZHS16GBK; 7、重启数据库  SQL>shutdown immediate;  SQL>startup 注意: 字符集设置最好在环境搭建开始的时候修改,后期修改可能会造成数据...
  • utf-8 to unicode

    2017-05-17 00:06:07
    一、utf-8 unicode utf-16 1、unicode 使用两字节表示字符。 2、utf-8 和 utf-16 均为变长编码,使用1~4个字节来表示字符。 3、utf-8 和 utf-16 是不一样的,汉字使用 unicode 表示是两个字节,utf-8 是三个字节,utf-...
  • var ServerMessagefunction utf8to16(str) {var out, i, len, c;var char2, char3;out = "";len = str.length;i = 0;while(i < len) {c = str.charCodeAt(i++);switch(c >> 4){case 0: case 1: case 2: cas...
  • 学习记录,便于查询 u8string, u16string, u32string 由 C++20 标准提供支持 学习自 ...头文件 #include <...u8string conv_utf16_to_utf8(u16string s); u16string conv_utf8_to_utf16(u8string s); /
  • utf8 keys rather than utf16?

    2020-12-02 19:49:54
    <div><p>Is there any reason why keys cannot be stored in UTF8 format rather than UTF16? <p>Under the lmdb hood they're just a bunch of bytes and as most keys tend to be Latin using 2 bytes per ...
  • <p>This <em>looks</em> like the same symptom as #32, however, I filed a new issue since the input data is different, and because I actually seem to need two <code>decode_to_utf8</code> calls here to ...
  • def utf16_cleanup(token): return "".join(c if ord(c) < 2**16 else REPLACEMENT_CHAR for c in token) </code></pre>该提问来源于开源项目:pyenchant/pyenchant</p></div>

空空如也

空空如也

1 2 3 4 5 ... 20
收藏数 9,571
精华内容 3,828
关键字:

utf8toutf16