package simpledb;
import java.io.*;
import java.util.ArrayList;
/**
* HeapFileEncoder reads a comma-delimited text file or accepts
* an array of tuples and converts it to
* pages of binary data in the appropriate format for simpledb heap pages.
* Pages are padded out to a specified length, and written consecutively in a
* data file.
*/
public class HeapFileEncoder {
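/** Convert the specified input text file into a binary page file.
* Each input line holds one tuple; fields within a line are separated by
* fieldSeparator and are encoded according to the matching entry of typeAr
* (INT_TYPE and STRING_TYPE fields are handled).
* The format of the output file will be as specified in HeapPage and
* HeapFile.
*
* @see HeapPage
* @see HeapFile
* @param inFile The input file to read data from
* @param outFile The output file to write data to
* @param npagebytes The number of bytes per page in the output file
* @param numFields the number of fields in each input line/output tuple
* @param typeAr the type of each field, in field order; must have at least
* numFields entries
* @param fieldSeparator the character that separates fields within a line
* @throws IOException if the input/output file can't be opened or read.
* Note that an integer field that cannot be parsed is reported on standard
* output as "BAD LINE" rather than raised as an exception.
*/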
public static void convert(File inFile, File outFile, int npagebytes,
int numFields, Type[] typeAr, char fieldSeparator)
throws IOException {
int nrecbytes = 0;
for (int i = 0; i < numFields ; i++) {
nrecbytes += typeAr[i].getLen();
}
int nrecords = (npagebytes * 8) / (nrecbytes * 8 + 1); //floor comes for free
// per record, we need one bit; there are nrecords per page, so we need
// nrecords bits, i.e., ceil(nrecords / 8) header bytes.
int nheaderbytes = (nrecords / 8);
if (nheaderbytes * 8 < nrecords)
nheaderbytes++; //ceiling
int nheaderbits = nheaderbytes * 8;
BufferedReader br = new BufferedReader(new FileReader(inFile));
FileOutputStream os = new FileOutputStream(outFile);
// our numbers probably won't be much larger than 1024 digits
char buf[] = new char[1024];
int curpos = 0;
int recordcount = 0;
int npages = 0;
int fieldNo = 0;
ByteArrayOutputStream headerBAOS = new ByteArrayOutputStream(nheaderbytes);
DataOutputStream headerStream = new DataOutputStream(headerBAOS);
ByteArrayOutputStream pageBAOS = new ByteArrayOutputStream(npagebytes);
DataOutputStream pageStream = new DataOutputStream(pageBAOS);
boolean done = false;
boolean first = true;
while (!done) {
int c = br.read();
// Ignore Windows/Notepad special line endings
if (c == '\r')
continue;
if (c == '\n') {
if (first)
continue;
recordcount++;
first = true;
} else
first = false;
if (c == fieldSeparator || c == '\n' || c == '\r') {
String s = new String(buf, 0, curpos);
if (typeAr[fieldNo] == Type.INT_TYPE) {
try {
pageStream.writeInt(Integer.parseInt(s.trim()));
} catch (NumberFormatException e) {
System.out.println ("BAD LINE : " + s);
}
}
else if (typeAr[fieldNo] == Type.STRING_TYPE) {
s = s.trim();
int overflow = Type.STRING_LEN - s.length();
if (overflow < 0) {
String news = s.substring(0,Type.STRING_LEN);
s = news;
}
pageStream.writeInt(s.length());
pageStream.writeBytes(s);
while (overflow-- > 0)
pageStream.write((byte)0);
}
curpos = 0;
if (c == '\n')
fieldNo = 0;
else
fieldNo++;
} else if (c == -1) {
done = true;
} else {
buf[curpos++] = (char)c;
continue;
}
// if we wrote a full page of records, or if we're done altogether,
// write out the header of the page.
//
// in the header, write a 1 for bits that correspond to records we've
// written and 0 for empty slots.
//
// when we're done, also flush the page to disk, but only if it has
// records on it. however, if this file is empty, do flush an empty
// page to disk.
if (recordcount >= nrecords
|| done && recordcount > 0
|| done && npages == 0) {
int i = 0;
byte headerbyte = 0;
for (i=0; i
*
* The format of the output file will be as specified in HeapPage and
* HeapFile.
*
* @see HeapPage
* @see HeapFile
* @param tuples the tuples - a list of tuples, each represented by a list of integers that are
* the field values for that tuple.
* @param outFile The output file to write data to
* @param npagebytes The number of bytes per page in the output file
* @param numFields the number of fields in each input tuple
* @throws IOException if the temporary/output file can't be opened
*/
public static void convert(ArrayList
* Assume format of the input file is (note that only integer fields are
* supported):
* int,...,int\n
* int,...,int\n
* ...
* where each row represents a tuple.
*