package example; import com.amazonaws.AmazonClientException; import com.amazonaws.auth.BasicAWSCredentials; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; import com.amazonaws.services.s3.model.GetObjectRequest; import com.amazonaws.services.s3.model.S3Object; import edu.illinois.cs.cogcomp.thrift.base.Labeling; import edu.illinois.cs.cogcomp.thrift.base.Span; import edu.illinois.cs.cogcomp.thrift.curator.Record; import org.apache.commons.io.IOUtils; import org.apache.thrift.TDeserializer; import org.apache.thrift.TException; import java.util.List; /** * Created by haowu on 11/7/14. * An adhoc solution for reading chunk record of gigaword. */ public class GigawordChunkReader { /** * Deserialize Record, copied to here so this class is self contained. * * @param bytes * @return * @throws TException */ public static Record deserializeRecord(byte[] bytes) throws TException { Record rec = new Record(); TDeserializer td = new TDeserializer(); td.deserialize(rec, bytes); return rec; } private static GigawordChunkReader gigawordChunkReader = new GigawordChunkReader(); public static GigawordChunkReader getGigaWordChunkReader() { return gigawordChunkReader; } private AmazonS3 s3; private String bucketName; private GigawordChunkReader() { this.s3 = new AmazonS3Client(new BasicAWSCredentials("AKIAJSBHOGAQ5M4DHH2Q", "jnttSRxpJVmWxmwBPsWcOEz+ircsf16ERlboinep")); this.bucketName = "curatpr-processing-gigaword-chunk"; } /** * Return the chunk view, if we can find it. * Always pass the file name, with no folder info. * @param fName * @return */ public Record getChunkView(String fName) { if (fName.endsWith("_ner")) { fName = fName.replace("_ner", "_dependencies"); } else { if (!fName.endsWith("_dependencies")) { return null; } } // I accidentally put them into two different folder (forget to change the corpora name when uploading them, it is to late to change when I found this issue) String[] possibleLocations = { "result_coll/wiki-coref/" + fName, "result_coll/gigaword-chunk/" + fName }; for (String k : possibleLocations) { try { S3Object s3object = this.s3.getObject(new GetObjectRequest( bucketName, k)); Record r = deserializeRecord(IOUtils.toByteArray(s3object.getObjectContent())); return r; } catch( AmazonClientException ace){ }catch (Exception e) { e.printStackTrace(); } } return null; } public static void main(String[] args) { Record r = GigawordChunkReader.getGigaWordChunkReader().getChunkView("afe200206_16003_ner"); printChunks(r); } public static void printChunks(Record r){ List<Span> spans = r.getLabelViews().get("chunk").getLabels(); for(Span s: spans){ System.out.print( r.getRawText().substring(s.start,s.ending) ); System.out.print( "\t\t" ); System.out.println(s.getLabel()); } } }