2013-07-31 16 views

回答

1
  1. 首先讀取一個無符號短整數(unsigned short),它表示字符串編碼後的字節長度。
  2. 重複以下步驟,直到讀完該長度的所有字節:
  3. 讀取一個字節。如果該字節匹配位模式0xxxxxxx,那麼該字符由1個字節組成。如果該字節匹配位模式110xxxxx,那麼該字符由2個字節(unicode)組成。如果該字節匹配位模式1110xxxx,則該字符由3個字節組成。新字符組裝完成後,會被追加到要返回的字符串的末尾。

查看該函數背後的源代碼可能會有幫助:

/**
 * Reads a length-prefixed, UTF-8-style encoded string from {@code in}.
 *
 * <p>The length prefix is obtained with {@code readUnsignedShort()} and gives
 * the number of encoded bytes that follow. Those bytes are read in one
 * {@code readFully} call and then decoded into chars. Three sequence shapes
 * are accepted:
 * <ul>
 *   <li>{@code 0xxxxxxx} - one byte, one char</li>
 *   <li>{@code 110xxxxx 10xxxxxx} - two bytes, one char</li>
 *   <li>{@code 1110xxxx 10xxxxxx 10xxxxxx} - three bytes, one char</li>
 * </ul>
 * Any other lead byte ({@code 10xxxxxx} or {@code 1111xxxx}) is rejected.
 *
 * <p>When {@code in} is a {@code DataInputStream}, its {@code bytearr} and
 * {@code chararr} buffers are reused; if too small they are replaced by
 * arrays of twice the requested length. Otherwise fresh arrays of exactly
 * the requested length are allocated.
 *
 * @param in the source to read the length prefix and encoded bytes from
 * @return the decoded string; it may hold fewer chars than encoded bytes
 * @throws UTFDataFormatException if a multi-byte sequence is cut off by the
 *         end of the data, a continuation byte is not of the form
 *         {@code 10xxxxxx}, or a lead byte has an unsupported bit pattern
 * @throws IOException if reading from {@code in} fails
 */
public final static String readUTF(DataInput in) throws IOException {
    final int byteLen = in.readUnsignedShort();

    final byte[] raw;
    final char[] decoded;
    if (in instanceof DataInputStream) {
        // Reuse the stream's scratch buffers, growing both when too small.
        DataInputStream stream = (DataInputStream) in;
        if (stream.bytearr.length < byteLen) {
            stream.bytearr = new byte[byteLen * 2];
            stream.chararr = new char[byteLen * 2];
        }
        raw = stream.bytearr;
        decoded = stream.chararr;
    } else {
        raw = new byte[byteLen];
        decoded = new char[byteLen];
    }

    in.readFully(raw, 0, byteLen);

    int pos = 0;     // index of the next undecoded byte
    int outLen = 0;  // number of chars produced so far

    // Fast path: a non-negative byte is a 7-bit value and maps straight to a char.
    while (pos < byteLen && raw[pos] >= 0) {
        decoded[outLen++] = (char) raw[pos++];
    }

    // General path: classify each lead byte by its high bits.
    while (pos < byteLen) {
        final int lead = raw[pos] & 0xff;

        if (lead < 0x80) {
            /* 0xxxxxxx */
            decoded[outLen++] = (char) lead;
            pos++;
        } else if ((lead & 0xE0) == 0xC0) {
            /* 110x xxxx 10xx xxxx */
            final int end = pos + 2;
            if (end > byteLen) {
                throw new UTFDataFormatException(
                    "malformed input: partial character at end");
            }
            final int second = raw[end - 1];
            if ((second & 0xC0) != 0x80) {
                throw new UTFDataFormatException(
                    "malformed input around byte " + end);
            }
            decoded[outLen++] = (char) (((lead & 0x1F) << 6) | (second & 0x3F));
            pos = end;
        } else if ((lead & 0xF0) == 0xE0) {
            /* 1110 xxxx 10xx xxxx 10xx xxxx */
            final int end = pos + 3;
            if (end > byteLen) {
                throw new UTFDataFormatException(
                    "malformed input: partial character at end");
            }
            final int second = raw[end - 2];
            final int third = raw[end - 1];
            final boolean secondOk = (second & 0xC0) == 0x80;
            final boolean thirdOk = (third & 0xC0) == 0x80;
            if (!(secondOk && thirdOk)) {
                throw new UTFDataFormatException(
                    "malformed input around byte " + (end - 1));
            }
            decoded[outLen++] = (char) (((lead & 0x0F) << 12)
                    | ((second & 0x3F) << 6)
                    | (third & 0x3F));
            pos = end;
        } else {
            /* 10xx xxxx, 1111 xxxx */
            throw new UTFDataFormatException(
                "malformed input around byte " + pos);
        }
    }

    // Multi-byte sequences collapse to one char each, so outLen may be below byteLen.
    return new String(decoded, 0, outLen);
}

+0

「2個字節(Unicode)」並不是對Javadoc原文的準確概括。 – EJP