2011-06-06 34 views
8

I compute the MD5 of a local file, but it differs from the MD5 (eTag) of the "same" file in Amazon S3. What I want to achieve is to find out whether the latest file I have in S3 is identical to the one I have locally. If I cannot compare MD5s, how else can I do this?

Generating the MD5 from the local file (truncated code):

MessageDigest md = MessageDigest.getInstance("MD5"); 
byte[] md5 = Files.getDigest(localFile, md); 
String hashtext = DigestUtils.md5Hex(md5); 

Retrieving the MD5 (eTag) from S3 (truncated code):

ObjectListing objectListing = s3.listObjects(new ListObjectsRequest().withBucketName(bucketName)); 
List<S3ObjectSummary> objectSummaries = objectListing.getObjectSummaries(); 
for(S3ObjectSummary objectSummary : objectSummaries) { 
    String MD5 = objectSummary.getETag(); 
} 

PS: I am using the org.apache.commons.codec.digest.DigestUtils and com.google.common.io.Files libraries.

+0

I'd like to point out that if you upload files larger than 5 GB (using multipart upload), the eTag from S3 is no longer simply the MD5 of the file. It looks like the MD5 of the file plus some extra metadata, but the algorithm is not documented as far as I know. – 2012-08-29 21:36:47

Answers

11
String hashtext = DigestUtils.md5Hex(md5); 

computes the MD5 of the MD5 you have just computed. See the DigestUtils.md5Hex documentation.

So hashtext is actually MD5(MD5(file)) rather than MD5(file).
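
If you want to keep the Guava Files.getDigest call, a minimal sketch of the fix (my wording, assuming commons-codec 1.4+ for org.apache.commons.codec.binary.Hex) is to hex-encode the digest bytes instead of hashing them a second time:

MessageDigest md = MessageDigest.getInstance("MD5"); 
byte[] md5 = Files.getDigest(localFile, md); 
// Hex-encode the digest instead of running DigestUtils.md5Hex over it again. 
String hashtext = Hex.encodeHexString(md5); 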

+1

Wrong link? – 2011-06-06 21:36:46

+0

Indeed, fixed it – 2011-06-06 21:38:22

+0

Thanks! It works now. In hindsight, that was a silly move on my part :P – okysabeni 2011-06-07 13:44:55

3

Bruno's answer nails it, but I wanted to point out (especially since you are already using Apache Commons) that if you want to do this without the Google Guava dependency, it is actually not hard.

You would replace this:

byte[] md5 = Files.getDigest(localFile, md); 

with this (using a Java 7 try-with-resources block):

try (FileInputStream fis = new FileInputStream(localFile)) { 
    byte[] md5 = DigestUtils.md5(fis); 
} 

The md5(InputStream) method has been available in Apache Commons Codec since version 1.4.
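
Once you have the local MD5 as a hex string, you can compare it against the eTag from the listing in the question (a sketch of my own; objectSummary comes from the question's code and this only holds for objects that were not uploaded via multipart):

try (FileInputStream fis = new FileInputStream(localFile)) { 
    String localMd5Hex = DigestUtils.md5Hex(fis); 
    // For single-part uploads the eTag is the plain hex MD5 of the object. 
    boolean same = localMd5Hex.equalsIgnoreCase(objectSummary.getETag()); 
} 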

1

Here is my own implementation of the S3 eTag. I tested it against big files uploaded to S3 in order to get reference values for multipart eTags.

Keep in mind that compression and client-side encryption make the eTag useless for checking a downloaded file.


Etag.java

package io.github.caillette.s3; 

import com.amazonaws.services.s3.transfer.TransferManagerConfiguration; 
import com.google.common.io.ByteSource; 
import org.apache.commons.codec.digest.DigestUtils; 

import java.io.IOException; 
import java.io.InputStream; 
import java.security.DigestException; 
import java.security.MessageDigest; 
import java.util.ArrayList; 
import java.util.List; 
import java.util.regex.Matcher; 
import java.util.regex.Pattern; 

import static com.google.common.base.Preconditions.checkArgument; 

/** 
* Represents the 
* <a href="http://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html" >eTag</a> 
* calculated by Amazon S3. 
*/ 
public final class Etag { 

    private final String md5 ; 
    private final Integer partNumber ; 

    private static final Pattern MD5_PATTERN = Pattern.compile("[a-f0-9]{32}") ; 
    private static final Pattern FULL_ETAG_PATTERN 
     = Pattern.compile("(" + MD5_PATTERN.pattern() + ")(?:-([0-9]+))?") ; 

    private Etag(final byte[] md5, final Integer partNumber) { 
    this(md5asString(md5), partNumber) ; 
    } 

    public static String md5asString(final byte[] md5) { 
    checkArgument(md5.length == 16) ; 
    // DigestTools.toHex is a hex-encoding helper (not shown here); 
    // commons-codec's Hex.encodeHexString(byte[]) would do the same. 
    return DigestTools.toHex(md5); 
    } 

    private Etag(final String md5, final Integer partNumber) { 
    checkArgument(MD5_PATTERN.matcher(md5).matches()) ; 
    checkArgument(partNumber == null || partNumber > 0) ; 
    this.md5 = md5 ; 
    this.partNumber = partNumber ; 
    } 

    public String asString() { 
    return md5 + (partNumber == null ? "" : "-" + partNumber) ; 
    } 

    public static Etag parse(final String string) { 
    final Matcher matcher = FULL_ETAG_PATTERN.matcher(string) ; 
    checkArgument(matcher.matches(), "Invalid format: " + string) ; 
    final String md5 = matcher.group(1) ; 
    final String partNumber = matcher.group(2) ; 
    return new Etag(md5, partNumber == null ? null : Integer.parseInt(partNumber)) ; 
    } 

    @Override 
    public String toString() { 
    return getClass().getSimpleName() + "{" + asString() + "}" ; 
    } 

    @Override 
    public boolean equals(final Object other) { 
    if(this == other) { 
     return true ; 
    } 
    if(other == null || getClass() != other.getClass()) { 
     return false ; 
    } 

    final Etag etag = (Etag) other ; 

    if(! md5.equals(etag.md5)) { 
     return false ; 
    } 
    if(partNumber != null ? !partNumber.equals(etag.partNumber) : etag.partNumber != null) { 
     return false; 
    } 

    return true ; 
    } 

    @Override 
    public int hashCode() { 
    int result = md5.hashCode(); 
    result = 31 * result + (partNumber != null ? partNumber.hashCode() : 0) ; 
    return result; 
    } 


    public static final long DEFAULT_MINIMUM_UPLOAD_PART_SIZE 
     = new TransferManagerConfiguration().getMinimumUploadPartSize() ; 



// ======= 
// Compute 
// ======= 

    /** 
    * Calculates {@link Etag} (MD5 checksum in the AWS way). 
    * For small files (less than {@link #DEFAULT_MINIMUM_UPLOAD_PART_SIZE}, practically 5 GB) 
    * it's the plain MD5. For big files, it's the MD5 of the concatenated per-part MD5s, 
    * suffixed with "-" and the number of parts. 
    * 
    * http://permalink.gmane.org/gmane.comp.file-systems.s3.s3tools/583 
    * https://github.com/Teachnova/s3md5 
    * http://stackoverflow.com/questions/12186993/what-is-the-algorithm-to-compute-the-amazon-s3-etag-for-a-file-larger-than-5gb 
    */ 
    public static Etag compute(final ByteSource byteSource, final int chunkSize) 
     throws IOException, DigestException 
    { 
    final List<byte[]> md5s = new ArrayList<>() ; 
    try(final InputStream inputStream = byteSource.openBufferedStream()) { 
     while(true) { 
     if(inputStream.available() > 0) { 
      final byte[] md5 = computeMd5(inputStream, chunkSize) ; 
      md5s.add(md5) ; 
     } else { 
      break ; 
     } 
     } 
    } 
    if(md5s.size() == 1) { 
     return new Etag(md5s.get(0), null) ; 
    } else { 
     final byte[] md5concatenation = new byte[ md5s.size() * 16 ] ; 
     for(int i = 0 ; i < md5s.size() ; i ++) { 
     final byte[] md5 = md5s.get(i) ; 
     System.arraycopy(md5, 0, md5concatenation, i * 16, 16) ; 
     } 
     final byte[] finalMd5 = DigestUtils.md5(md5concatenation) ; 
     return new Etag(finalMd5, md5s.size()) ; 
    } 
    } 

    /*package*/ static byte[] computeMd5(
     final InputStream inputStream, 
     final int length 
) throws IOException, DigestException { 
    final MessageDigest md5Digest = DigestUtils.getMd5Digest() ; 
    final byte[] buffer = new byte[ 8192 ] ; 
    long totalRead = 0 ; 
    while(true) { 
     final long greatestRemainder = length - totalRead ; 
     final int sizeToRead = greatestRemainder > buffer.length 
      ? buffer.length : (int) greatestRemainder ; 
     final int read = inputStream.read(buffer, 0, sizeToRead) ; 
     if(read > 0) { 
     md5Digest.update(buffer, 0, read) ; 
     totalRead += read ; 
     } else { 
     return md5Digest.digest() ; 
     } 
    } 
    } 
} 

EtagTest.java

package io.github.caillette.s3; 

import com.google.common.io.Files; 
import org.apache.commons.codec.digest.DigestUtils; 
import org.junit.Ignore; 
import org.junit.Rule; 
import org.junit.Test; 
import org.novelang.testing.junit.MethodSupport; 
import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; 

import java.io.ByteArrayOutputStream; 
import java.io.File; 
import java.io.IOException; 
import java.io.InputStream; 

import static org.assertj.core.api.Assertions.assertThat ; 

public class EtagTest { 

    /** 
    * This test checks equality with an eTag calculated by S3 itself. 
    * To trigger multipart upload (which causes special eTag calculation), 
    * generate a garbage file with a size of 120_000_000L and upload it 
    * with {@link TransferManagerConfigurator#multipartCopyThreshold} set 
     * to 115343360 bytes (110 MiB). 
    */ 
    @Test 
    public void bigMultipart() throws Exception { 
    final File file = createGarbageFile(120_000_000) ; 
    final int chunkSize = 5 * 1024 * 1024 ; 
    final long start = System.currentTimeMillis() ; 
    final Etag etag = Etag.compute(Files.asByteSource(file), chunkSize) ; 
    LOGGER.info("Calculated " + etag + " in " + (System.currentTimeMillis() - start) + " ms.") ; 
    assertThat(etag.asString()).isEqualTo("94b81d1e846ec106c09eabc984314008-23") ; 
    } 

    @Test 
    public void smallMultipart() throws Exception { 
    final File file = createGarbageFile(30_000) ; 
    final int chunkSize = 10_000 ; 
    final Etag etag = Etag.compute(Files.asByteSource(file), chunkSize) ; 
    assertThat(etag.asString()).isEqualTo("056b4552c5ace587b5d62305d99e8555-3") ; 
    } 

    @Test 
    public void parseMonopart() throws Exception { 
    final Etag etag = Etag.parse("056b4552c5ace587b5d62305d99e8555") ; 
    assertThat(etag.asString()).isEqualTo("056b4552c5ace587b5d62305d99e8555") ; 
    } 

    @Test 
    public void parseMultipart() throws Exception { 
    final Etag etag = Etag.parse("056b4552c5ace587b5d62305d99e8555-33") ; 
    assertThat(etag.asString()).isEqualTo("056b4552c5ace587b5d62305d99e8555-33") ; 
    } 

    @Test 
    public void smallMonopart() throws Exception { 
    final File file = createGarbageFile(1_000) ; 
    final int chunkSize = 10_000 ; 
    final Etag etag = Etag.compute(Files.asByteSource(file), chunkSize) ; 
    assertThat(etag.asString()).isEqualTo("cc24b86af8f8c18ca90703db6834f3f3") ; 
    } 


// ======= 
// Fixture 
// ======= 

    private static final Logger LOGGER = LoggerFactory.getLogger(EtagTest.class) ; 

    @Rule 
    public final MethodSupport methodSupport = new MethodSupport() { } ; 

    private byte[] createGarbageByteArray(final long length) throws IOException { 
    final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream() ; 
    GarbageTools.generate(byteArrayOutputStream, length) ; 
    return byteArrayOutputStream.toByteArray() ; 
    } 

    private File createGarbageFile(final long fileLength) throws IOException { 
    final File garbageFile 
     = File.createTempFile("garbage-", ".txt", methodSupport.getDirectory()) ; 
// garbageFile.deleteOnExit() ; 
    final long start = System.currentTimeMillis() ; 
    GarbageTools.generate(garbageFile, fileLength) ; 
    LOGGER.info("Generated file of " + fileLength + " bytes: " + garbageFile.getAbsolutePath() 
     + " in " + (System.currentTimeMillis() - start) + " ms.") ; 
    return garbageFile ; 
    } 

} 

GarbageTools.java

package io.github.caillette.s3; 

import com.google.common.base.Charsets; 

import java.io.BufferedOutputStream; 
import java.io.File; 
import java.io.FileOutputStream; 
import java.io.IOException; 
import java.io.OutputStream; 

/** 
* Generates files filled with deterministic garbage. 
*/ 
public final class GarbageTools { 

    private GarbageTools() { } 

    public static void generate(final File file, final long length) throws IOException { 
    try(
     final FileOutputStream fileOutputStream = new FileOutputStream(file) ; 
     final OutputStream outputStream = new BufferedOutputStream(fileOutputStream) 
    ) { 
     generate(outputStream, length) ; 
    } 
    } 

    /** 
    * Slow but it works. 
    */ 
    public static void generate(final OutputStream outputStream, final long length) 
     throws IOException 
    { 
    long bytesWritten = 0 ; 
    long counter = 0 ; 
    final StringBuilder stringBuilder = new StringBuilder() ; 
    while(true) { 
     stringBuilder.append(counter ++).append(" ") ; 
     final int lineLength = stringBuilder.length() ; 
     final boolean done = bytesWritten + lineLength >= length ; 
     if(done) { 
     final int remainder = (int) (length - bytesWritten) ; 
     stringBuilder.delete(remainder, stringBuilder.length()) ; 
     } 
     outputStream.write(stringBuilder.toString().getBytes(Charsets.US_ASCII)) ; 
     bytesWritten += stringBuilder.length() ; 
     stringBuilder.delete(0, stringBuilder.length()) ; 
     if(done) { 
     break ; 
     } 
    } 
    } 
}
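
Putting it together, a hedged usage sketch of my own (not part of the original answer): compare the eTag reported by S3 with the one computed locally. bucketName, key, localFile, partSize and the AmazonS3 client s3 are assumptions, and partSize must match the part size used for the original upload, otherwise the part count (and thus the eTag) will differ.

// Sketch: compare the eTag S3 reports with the locally computed one. 
ObjectMetadata metadata = s3.getObjectMetadata(bucketName, key); 
Etag remote = Etag.parse(metadata.getETag()); 
Etag local = Etag.compute(Files.asByteSource(localFile), partSize); 
if (remote.equals(local)) { 
    // Local file and S3 object have the same content. 
} 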