dataframe
Created by: kyungee
// Location of the sample training CSV that is loaded into a DataFrame below.
String filepath = "/home/chris/.kaggle/competitions/talkingdata-adtracking-fraud-detection/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv";

// Read the CSV and convert every row into a MapExample.Log bean.
//
// The "header" option makes Spark consume the first line as column names instead of
// handing it to the numeric parsers below, where it would raise NumberFormatException.
// NOTE(review): this assumes the file begins with a header row — confirm against the data;
// if it does not, drop the option or the first record will be skipped.
JavaRDD<MapExample.Log> logsRDD = spark.read()
        .option("header", "true")
        .csv(filepath)
        .javaRDD()
        .map((line) -> {
            // Fields are read by position straight from the Row. Without an explicit schema
            // the CSV reader yields string columns, so getString(i) is the matching accessor.
            // This replaces splitting Row.toString() on commas, which depended on the
            // "[a,b,c]" rendering and broke on any field that itself contains a comma.
            MapExample.Log log = new MapExample.Log();
            log.setIp(Long.parseLong(line.getString(0)));
            log.setApp_id(Integer.parseInt(line.getString(1)));
            log.setDevice_no(Integer.parseInt(line.getString(2)));
            log.setOs_no(Integer.parseInt(line.getString(3)));
            log.setChannel_no(Integer.parseInt(line.getString(4)));
            log.setClick_time(line.getString(5));
            // Column 6 is intentionally not copied into the bean (the original skipped it too);
            // column 7 carries the value stored via setIs_attributed.
            log.setIs_attributed(Integer.parseInt(line.getString(7).trim()));
            return log;
        });

// Build a DataFrame whose schema is derived from the MapExample.Log bean class.
Dataset<Row> logDF = spark.createDataFrame(logsRDD, MapExample.Log.class);
-
Please register or login to post a comment