fix(java-input): support Java modified UTF-8 strings (PR #2654)

* [core] fix jadx.plugins.input.java.data.ConstPoolReader.parseString() failing on Kotlin annotations that store a byte array as a String: class-file strings are now decoded as modified UTF-8, following the jvms-4.4.7 encoding rules

* move decode method into utility class, add test, fix code style

---------

Co-authored-by: Skylot <118523+skylot@users.noreply.github.com>
This commit is contained in:
wech71
2025-10-12 00:57:00 +02:00
committed by GitHub
parent 5f1985f281
commit 0f495afc99
3 changed files with 106 additions and 3 deletions
@@ -1,6 +1,5 @@
package jadx.plugins.input.java.data;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -23,6 +22,7 @@ import jadx.plugins.input.java.data.attributes.types.JavaBootstrapMethodsAttr;
import jadx.plugins.input.java.data.attributes.types.data.RawBootstrapMethod;
import jadx.plugins.input.java.utils.DescriptorParser;
import jadx.plugins.input.java.utils.JavaClassParseException;
import jadx.plugins.input.java.utils.ModifiedUTF8Decoder;
public class ConstPoolReader {
private final JavaClassReader clsReader;
@@ -235,8 +235,7 @@ public class ConstPoolReader {
/**
 * Converts the raw bytes of a constant pool string entry into a Java string.
 * The bytes are decoded as "modified UTF-8" (jvms-4.4.7) rather than standard UTF-8.
 *
 * @param bytes raw string bytes
 * @return decoded string
 */
@NotNull
private String parseString(byte[] bytes) {
	return ModifiedUTF8Decoder.decodeString(bytes);
}
private String fixType(String clsName) {
@@ -0,0 +1,75 @@
package jadx.plugins.input.java.utils;
import java.nio.charset.StandardCharsets;
/**
 * Decoder for the "modified UTF-8" string format described in jvms-4.4.7.
 * <p>
 * Differences from standard UTF-8 handled here:
 * <ul>
 * <li>the char {@code \u0000} is encoded with two bytes: {@code 0xC0 0x80}</li>
 * <li>supplementary characters are encoded as a surrogate pair, each surrogate
 * stored as its own 3-byte sequence (6 bytes in total); 4-byte sequences are not used</li>
 * </ul>
 */
public class ModifiedUTF8Decoder {

	/**
	 * Decode a modified UTF-8 byte array into a string.
	 *
	 * @param bytes encoded bytes, must not be null
	 * @return decoded string
	 * @throws JavaClassParseException if the array ends in the middle of a multi-byte sequence
	 *                                 or contains a byte which is not valid at its position
	 */
	public static String decodeString(byte[] bytes) {
		int len = bytes.length;
		// fast path: a string made only of 7-bit chars needs no decoding
		if (isAscii(bytes)) {
			return new String(bytes, StandardCharsets.US_ASCII);
		}
		// every encoded sequence yields exactly one char, so 'len' is an upper bound
		StringBuilder sb = new StringBuilder(len);
		int i = 0;
		while (i < len) {
			int x = bytes[i] & 0xff;
			if ((x & 0x80) == 0) {
				// 1 byte: 0xxxxxxx
				sb.append((char) x);
				i++;
				continue;
			}
			if (i + 1 >= len) {
				throw new JavaClassParseException("Inconsistent byte array structure: too short");
			}
			int y = bytes[i + 1] & 0xff;
			if (!isContinuation(y)) {
				throw new JavaClassParseException("Inconsistent byte array structure: unexpected char");
			}
			if ((x & 0xE0) == 0xC0) {
				// 2 bytes: 110xxxxx 10xxxxxx (this form also covers 0xC0 0x80 => \u0000)
				sb.append((char) (((x & 0x1f) << 6) | (y & 0x3f)));
				i += 2;
			} else if ((x & 0xF0) == 0xE0) {
				// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
				if (i + 2 >= len) {
					// previously such a truncated tail was skipped without any error
					throw new JavaClassParseException("Inconsistent byte array structure: too short");
				}
				int z = bytes[i + 2] & 0xff;
				if (!isContinuation(z)) {
					throw new JavaClassParseException("Inconsistent byte array structure: unexpected char");
				}
				// Both halves of a 6-byte supplementary character are regular 3-byte sequences
				// (0xED 0xAx .. and 0xED 0xBx ..), so decoding them one by one appends
				// the high and the low surrogate in order and no separate 6-byte rule is needed.
				sb.append((char) (((x & 0x0f) << 12) | ((y & 0x3f) << 6) | (z & 0x3f)));
				i += 3;
			} else {
				// stray continuation byte or a lead byte of a form not used by modified UTF-8
				throw new JavaClassParseException("Inconsistent byte array structure: unexpected char");
			}
		}
		return sb.toString();
	}

	/** Check that no byte has the high bit set. */
	private static boolean isAscii(byte[] bytes) {
		for (byte b : bytes) {
			if ((b & 0x80) != 0) {
				return false;
			}
		}
		return true;
	}

	/** Check for a continuation byte of the form 10xxxxxx. */
	private static boolean isContinuation(int b) {
		return (b & 0xC0) == 0x80;
	}
}
@@ -0,0 +1,29 @@
package jadx.plugins.input.java.utils;
import org.junit.jupiter.api.Test;
import static jadx.plugins.input.java.utils.ModifiedUTF8Decoder.decodeString;
import static org.assertj.core.api.Assertions.assertThat;
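/*
 * Note: the supplementary characters used in test() are encoded as 6-byte surrogate pairs
 * (jvms-4.4.7). Each half of such a pair is a valid 3-byte sequence on its own, so well-formed
 * input is decoded by the 3-byte rule and never reaches a dedicated 6-byte branch.
 * TODO: decide whether a dedicated 6-byte branch is needed at all
 */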
class ModifiedUTF8DecoderTest {
@Test
public void test() {
String str = "aÆřᛒቶ北𝄠😀🨄𐆙";
byte[] mUTF8Bytes = new byte[] { 97, -61, -122, -59, -103, -31, -101, -110, -31, -119, -74, -17,
-91, -93, -19, -96, -76, -19, -76, -96, -19, -96, -67, -19, -72,
-128, -19, -96, -66, -19, -72, -124, -19, -96, -128, -19, -74, -103 };
assertThat(decodeString(mUTF8Bytes)).isEqualTo(str);
}
@Test
public void testASCIIOnly() {
String str = "Hello, world!";
byte[] mUTF8Bytes = new byte[] { 72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 33 };
assertThat(decodeString(mUTF8Bytes)).isEqualTo(str);
}
}