這是我自己實作的 S3 eTag 計算。我用上傳到 S3 的大檔案對它做過測試,以取得多部分(multipart)eTag 的參考值。
請記住,檢查下載的檔案時,壓縮和用戶端加密都會使 eTag 失去比對作用。
Etag.java
package io.github.caillette.s3;
import com.amazonaws.services.s3.transfer.TransferManagerConfiguration;
import com.google.common.io.ByteSource;
import org.apache.commons.codec.digest.DigestUtils;
import java.io.IOException;
import java.io.InputStream;
import java.security.DigestException;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.google.common.base.Preconditions.checkArgument;
/**
* Represents the
* <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html" >eTag</a>
* calculated by Amazon S3.
*/
/**
 * Represents the
 * <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html" >eTag</a>
 * calculated by Amazon S3: the hex MD5 of the whole content for a single-part upload,
 * or the hex MD5 of the concatenated per-part MD5s suffixed with {@code -partCount}
 * for a multipart upload.
 */
public final class Etag {

  /** Lowercase hexadecimal MD5, always 32 characters. */
  private final String md5 ;

  /** Number of multipart chunks, or {@code null} for a single-part eTag. */
  private final Integer partNumber ;

  private static final Pattern MD5_PATTERN = Pattern.compile("[a-f0-9]{32}") ;

  private static final Pattern FULL_ETAG_PATTERN
      = Pattern.compile("(" + MD5_PATTERN.pattern() + ")(?:-([0-9]+))?") ;

  private Etag(final byte[] md5, final Integer partNumber) {
    this(md5asString(md5), partNumber) ;
  }

  /**
   * Converts a raw 16-byte MD5 digest into its lowercase hexadecimal form.
   *
   * @param md5 a raw digest, must be exactly 16 bytes long.
   * @return a 32-character lowercase hexadecimal string.
   * @throws IllegalArgumentException if {@code md5} is not 16 bytes long.
   */
  public static String md5asString(final byte[] md5) {
    checkArgument(md5.length == 16) ;
    return DigestTools.toHex(md5) ;
  }

  private Etag(final String md5, final Integer partNumber) {
    checkArgument(MD5_PATTERN.matcher(md5).matches()) ;
    checkArgument(partNumber == null || partNumber > 0) ;
    this.md5 = md5 ;
    this.partNumber = partNumber ;
  }

  /** Returns the canonical S3 form: {@code <md5>} or {@code <md5>-<partCount>}. */
  public String asString() {
    return md5 + (partNumber == null ? "" : "-" + partNumber) ;
  }

  /**
   * Parses an eTag string as reported by S3.
   *
   * @param string a 32-character lowercase hex MD5, optionally followed by
   *     {@code -<partCount>} with a strictly positive part count.
   * @throws IllegalArgumentException if the format does not match.
   */
  public static Etag parse(final String string) {
    final Matcher matcher = FULL_ETAG_PATTERN.matcher(string) ;
    checkArgument(matcher.matches(), "Invalid format: " + string) ;
    final String md5 = matcher.group(1) ;
    final String partNumber = matcher.group(2) ;
    return new Etag(md5, partNumber == null ? null : Integer.parseInt(partNumber)) ;
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "{" + asString() + "}" ;
  }

  @Override
  public boolean equals(final Object other) {
    if(this == other) {
      return true ;
    }
    if(other == null || getClass() != other.getClass()) {
      return false ;
    }
    final Etag etag = (Etag) other ;
    if(! md5.equals(etag.md5)) {
      return false ;
    }
    if(partNumber != null ? !partNumber.equals(etag.partNumber) : etag.partNumber != null) {
      return false;
    }
    return true ;
  }

  @Override
  public int hashCode() {
    int result = md5.hashCode();
    result = 31 * result + (partNumber != null ? partNumber.hashCode() : 0) ;
    return result;
  }

  /**
   * Smallest part size the AWS SDK uses for multipart uploads, as configured by default
   * in {@link TransferManagerConfiguration} (5 MB at the time of writing — verify
   * against the SDK version in use).
   */
  public static final long DEFAULT_MINIMUM_UPLOAD_PART_SIZE
      = new TransferManagerConfiguration().getMinimumUploadPartSize() ;

  // =======
  // Compute
  // =======

  /**
   * Calculates the {@link Etag} (MD5 checksum in the AWS way).
   * For content uploaded as a single part it is the plain MD5. For multipart uploads
   * it is the MD5 of the concatenated per-chunk MD5s, suffixed with the chunk count.
   * An empty source yields the MD5 of zero bytes with no part suffix.
   *
   * http://permalink.gmane.org/gmane.comp.file-systems.s3.s3tools/583
   * https://github.com/Teachnova/s3md5
   * http://stackoverflow.com/questions/12186993/what-is-the-algorithm-to-compute-the-amazon-s3-etag-for-a-file-larger-than-5gb
   *
   * @param byteSource the content to digest, non-null.
   * @param chunkSize the multipart chunk size in bytes, must be strictly positive.
   * @throws IllegalArgumentException if {@code chunkSize} is not strictly positive.
   */
  public static Etag compute(final ByteSource byteSource, final int chunkSize)
      throws IOException, DigestException
  {
    checkArgument(chunkSize > 0, "chunkSize must be strictly positive: " + chunkSize) ;
    final List<byte[]> md5s = new ArrayList<>() ;
    try(final InputStream inputStream = byteSource.openBufferedStream()) {
      while(true) {
        // Peek a single byte to detect end of stream reliably. The previous
        // implementation relied on InputStream#available(), which is documented as an
        // estimate and may return 0 while bytes remain, silently truncating the digest.
        final int firstByte = inputStream.read() ;
        if(firstByte < 0) {
          break ;
        }
        final MessageDigest chunkDigest = DigestUtils.getMd5Digest() ;
        chunkDigest.update((byte) firstByte) ;
        updateFromStream(chunkDigest, inputStream, chunkSize - 1) ;
        md5s.add(chunkDigest.digest()) ;
      }
    }
    if(md5s.isEmpty()) {
      // Empty content: S3 reports the MD5 of zero bytes, with no part suffix.
      // (Previously this fell through to a part count of 0 and threw.)
      return new Etag(DigestUtils.md5(new byte[0]), null) ;
    }
    if(md5s.size() == 1) {
      return new Etag(md5s.get(0), null) ;
    }
    // Multipart: MD5 of the concatenation of the raw 16-byte chunk digests.
    final byte[] md5concatenation = new byte[ md5s.size() * 16 ] ;
    for(int i = 0 ; i < md5s.size() ; i ++) {
      System.arraycopy(md5s.get(i), 0, md5concatenation, i * 16, 16) ;
    }
    final byte[] finalMd5 = DigestUtils.md5(md5concatenation) ;
    return new Etag(finalMd5, md5s.size()) ;
  }

  /**
   * Computes the MD5 of up to {@code length} bytes read from {@code inputStream},
   * stopping early at end of stream.
   */
  /*package*/ static byte[] computeMd5(
      final InputStream inputStream,
      final int length
  ) throws IOException, DigestException {
    final MessageDigest md5Digest = DigestUtils.getMd5Digest() ;
    updateFromStream(md5Digest, inputStream, length) ;
    return md5Digest.digest() ;
  }

  /**
   * Feeds up to {@code length} bytes from {@code inputStream} into {@code digest},
   * stopping early at end of stream.
   */
  private static void updateFromStream(
      final MessageDigest digest,
      final InputStream inputStream,
      final int length
  ) throws IOException {
    final byte[] buffer = new byte[ 8192 ] ;
    long totalRead = 0 ;
    while(totalRead < length) {
      final long remaining = length - totalRead ;
      final int sizeToRead = remaining > buffer.length ? buffer.length : (int) remaining ;
      final int read = inputStream.read(buffer, 0, sizeToRead) ;
      if(read < 0) {
        return ;  // End of stream before filling the chunk: digest what we got.
      }
      digest.update(buffer, 0, read) ;
      totalRead += read ;
    }
  }
}
EtagTest.java
package io.github.caillette.s3;
import com.google.common.io.Files;
import org.apache.commons.codec.digest.DigestUtils;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import org.novelang.testing.junit.MethodSupport;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import static org.assertj.core.api.Assertions.assertThat ;
public class EtagTest {

  /**
   * Checks equality with an eTag calculated by S3 itself on a multipart upload.
   * To reproduce the reference value, generate a garbage file of 120_000_000 bytes and
   * upload it with {@link TransferManagerConfigurator#multipartCopyThreshold} set to
   * 115343360 bytes (110 MBi) so the multipart code path triggers.
   */
  @Test
  public void bigMultipart() throws Exception {
    final File garbageFile = createGarbageFile(120_000_000) ;
    final int chunkSize = 5 * 1024 * 1024 ;
    final long startTime = System.currentTimeMillis() ;
    final Etag computed = Etag.compute(Files.asByteSource(garbageFile), chunkSize) ;
    final long elapsed = System.currentTimeMillis() - startTime ;
    LOGGER.info("Calculated " + computed + " in " + elapsed + " ms.") ;
    assertThat(computed.asString()).isEqualTo("94b81d1e846ec106c09eabc984314008-23") ;
  }

  /** Three chunks of 10_000 bytes: exercises the multipart digest path cheaply. */
  @Test
  public void smallMultipart() throws Exception {
    final File garbageFile = createGarbageFile(30_000) ;
    final Etag computed = Etag.compute(Files.asByteSource(garbageFile), 10_000) ;
    assertThat(computed.asString()).isEqualTo("056b4552c5ace587b5d62305d99e8555-3") ;
  }

  /** A bare MD5 with no part suffix must round-trip through parsing. */
  @Test
  public void parseMonopart() throws Exception {
    final Etag parsed = Etag.parse("056b4552c5ace587b5d62305d99e8555") ;
    assertThat(parsed.asString()).isEqualTo("056b4552c5ace587b5d62305d99e8555") ;
  }

  /** An MD5 with a part-count suffix must round-trip through parsing. */
  @Test
  public void parseMultipart() throws Exception {
    final Etag parsed = Etag.parse("056b4552c5ace587b5d62305d99e8555-33") ;
    assertThat(parsed.asString()).isEqualTo("056b4552c5ace587b5d62305d99e8555-33") ;
  }

  /** Content smaller than one chunk: plain MD5, no part suffix. */
  @Test
  public void smallMonopart() throws Exception {
    final File garbageFile = createGarbageFile(1_000) ;
    final Etag computed = Etag.compute(Files.asByteSource(garbageFile), 10_000) ;
    assertThat(computed.asString()).isEqualTo("cc24b86af8f8c18ca90703db6834f3f3") ;
  }

  // =======
  // Fixture
  // =======

  private static final Logger LOGGER = LoggerFactory.getLogger(EtagTest.class) ;

  @Rule
  public final MethodSupport methodSupport = new MethodSupport() { } ;

  /** Builds an in-memory byte array of deterministic garbage of the given length. */
  private byte[] createGarbageByteArray(final long length) throws IOException {
    final ByteArrayOutputStream sink = new ByteArrayOutputStream() ;
    GarbageTools.generate(sink, length) ;
    return sink.toByteArray() ;
  }

  /** Creates a temporary file of deterministic garbage of exactly {@code fileLength} bytes. */
  private File createGarbageFile(final long fileLength) throws IOException {
    final File garbageFile
        = File.createTempFile("garbage-", ".txt", methodSupport.getDirectory()) ;
    // garbageFile.deleteOnExit() ;
    final long startTime = System.currentTimeMillis() ;
    GarbageTools.generate(garbageFile, fileLength) ;
    final long elapsed = System.currentTimeMillis() - startTime ;
    LOGGER.info("Generated file of " + fileLength + " bytes: " + garbageFile.getAbsolutePath()
        + " in " + elapsed + " ms.") ;
    return garbageFile ;
  }
}
GarbageTools.java
package io.github.caillette.s3;
import com.google.common.base.Charsets;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
/**
* Generates file with deterministic garbage.
*/
/**
 * Generates deterministic garbage: increasing decimal integers separated by single
 * spaces ({@code "0 1 2 3 ..."}), truncated so the output is exactly the requested
 * number of bytes.
 */
public final class GarbageTools {

  private GarbageTools() { }

  /**
   * Writes exactly {@code length} bytes of deterministic garbage into {@code file},
   * overwriting any previous content.
   *
   * @param file the destination file.
   * @param length the exact byte count to write, must be >= 0.
   * @throws IOException if the file cannot be created or written.
   */
  public static void generate(final File file, final long length) throws IOException {
    try(
        final FileOutputStream fileOutputStream = new FileOutputStream(file) ;
        final OutputStream outputStream = new BufferedOutputStream(fileOutputStream)
    ) {
      generate(outputStream, length) ;
    }
  }

  /**
   * Writes exactly {@code length} bytes of deterministic garbage into
   * {@code outputStream}. Slow but it works.
   *
   * @param outputStream the destination stream, left open.
   * @param length the exact byte count to write, must be >= 0.
   * @throws IOException if writing fails.
   */
  public static void generate(final OutputStream outputStream, final long length)
      throws IOException
  {
    long bytesWritten = 0 ;
    long counter = 0 ;
    final StringBuilder stringBuilder = new StringBuilder() ;
    while(true) {
      stringBuilder.append(counter ++).append(" ") ;
      final int lineLength = stringBuilder.length() ;
      final boolean done = bytesWritten + lineLength >= length ;
      if(done) {
        // Truncate the final token so the total comes out to exactly `length` bytes.
        final int remainder = (int) (length - bytesWritten) ;
        stringBuilder.setLength(remainder) ;
      }
      // StandardCharsets.US_ASCII replaces Guava's deprecated Charsets.US_ASCII.
      // Every character written is ASCII, so char count equals byte count below.
      outputStream.write(stringBuilder.toString().getBytes(StandardCharsets.US_ASCII)) ;
      bytesWritten += stringBuilder.length() ;
      stringBuilder.setLength(0) ;
      if(done) {
        break ;
      }
    }
  }
}
我想指出,如果您上傳大於 5GB 的文件(使用分段上傳),則 S3 的 eTag 不再是文件的簡單 MD5。它看起來像是文件的 MD5 再加上一些額外的元數據,但據我所知,該演算法並沒有公開文件記載。 – 2012-08-29 21:36:47