Jean-Francois Leveque

Retrait des solitaires des annotables en une seule passe : item puis user

......@@ -58,10 +58,12 @@ public class PreprocessingRunner implements ApplicationRunner {
setFilenames();
List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename));
// associationElements = cleanupSmallCounts(associationElements, 1, 1);
List<Integer> annotateIndexes = chooseAnnotated(associationElements, 1, 1);
writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements);
List<AssociationElement> annotableElements = removeFirstSmallCounts(associationElements, 1, 1);
List<Integer> annotateIndexes = chooseAnnotated(associationElements, annotableElements, 1, 1);
writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements, annotableElements);
}
// TODO remove the code duplication between cleanupSmallCounts and removeFirstSmallCounts
private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) {
boolean removedUser;
......@@ -114,9 +116,59 @@ public class PreprocessingRunner implements ApplicationRunner {
return associationElements;
}
private List<Integer> chooseAnnotated(List<AssociationElement> associationElements, int userSize, int itemSize) {
/**
 * Removes, in a single item pass followed by a single user pass, the elements
 * whose item or user is too rarely represented.
 *
 * <p>First every element whose item id appears in at most {@code userSize}
 * elements is dropped. Then, on what remains, every element whose user id
 * appears in at most {@code itemSize} elements is dropped. The two passes are
 * run exactly once each; the result is not re-checked after the user pass.
 *
 * <p>The input list itself is never modified: a new list is built whenever
 * something is removed, and the given list is returned as is when nothing is.
 *
 * @param associationElements the elements to filter
 * @param userSize            an item is removed when its element count is at most this value
 * @param itemSize            a user is removed when its element count is at most this value
 * @return the elements that survive both passes
 */
private List<AssociationElement> removeFirstSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) {
    boolean removedItem = false;
    boolean removedUser = false;

    // Items first. Removing one item never changes the count of another item,
    // so all counts can be computed up front in one pass. Keying a map by the
    // boxed id also guarantees ids are compared by value, not by reference.
    final java.util.Map<Long, Long> userCountByItem = new java.util.HashMap<>();
    for (AssociationElement element : associationElements) {
        userCountByItem.merge(element.getItemId(), 1L, Long::sum);
    }
    for (java.util.Map.Entry<Long, Long> entry : userCountByItem.entrySet()) {
        if (entry.getValue() <= userSize) {
            if (!removedItem) {
                removedItem = true;
                logger.debug("Removed first item");
            }
            logger.trace("Removed item {}", entry.getKey());
        }
    }
    if (removedItem) {
        associationElements = associationElements.stream()
                .filter(element -> userCountByItem.get(element.getItemId()) > userSize)
                .collect(Collectors.toList());
    }
    logger.debug("Remaining AssociationElement count {}", associationElements.size());

    // Then users, counted on the elements that survived the item pass.
    final java.util.Map<Long, Long> itemCountByUser = new java.util.HashMap<>();
    for (AssociationElement element : associationElements) {
        itemCountByUser.merge(element.getUserId(), 1L, Long::sum);
    }
    for (java.util.Map.Entry<Long, Long> entry : itemCountByUser.entrySet()) {
        if (entry.getValue() <= itemSize) {
            if (!removedUser) {
                removedUser = true;
                logger.debug("Removed first user");
            }
            logger.trace("Removed user {}", entry.getKey());
        }
    }
    if (removedUser) {
        associationElements = associationElements.stream()
                .filter(element -> itemCountByUser.get(element.getUserId()) > itemSize)
                .collect(Collectors.toList());
    }
    logger.debug("Remaining AssociationElement count {}", associationElements.size());

    logger.debug("Remover item or user {}", removedUser || removedItem);
    return associationElements;
}
private List<Integer> chooseAnnotated(List<AssociationElement> annotableElements, List<AssociationElement> associationElements, int userSize, int itemSize) {
List<Integer> annotatedChosen = new ArrayList<>();
int size = associationElements.size();
int size = annotableElements.size();
long userCount = 0;
long itemCount = 0;
AssociationElement randomAssociationElement;
......@@ -128,16 +180,17 @@ public class PreprocessingRunner implements ApplicationRunner {
randomInteger = new Integer(random.nextInt(size));
if (!annotatedChosen.contains(randomInteger)) {
randomAssociationElement = associationElements.get(randomInteger);
randomAssociationElement = annotableElements.get(randomInteger);
final Long itemId = randomAssociationElement.getItemId();
final Long userId = randomAssociationElement.getUserId();
userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count();
itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count();
logger.trace("Checking new AssociationElement for annotation");
// Decreasing values based on planned suppressions
// TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this
for (Integer annotatedIndex : annotatedChosen) {
checkingAssociationElement = associationElements.get(annotatedIndex);
checkingAssociationElement = annotableElements.get(annotatedIndex);
if (checkingAssociationElement.getUserId() == userId) {
userCount--;
}
......@@ -148,6 +201,7 @@ public class PreprocessingRunner implements ApplicationRunner {
if (userCount > userSize && itemCount > itemSize) {
annotatedChosen.add(randomInteger);
logger.debug("Adding new AssociationElement to annotated, total is {}", annotatedChosen.size());
}
}
}
......@@ -155,9 +209,10 @@ public class PreprocessingRunner implements ApplicationRunner {
return annotatedChosen;
}
private void writeSampleAndAnnotated(File sampleFile, File annotatedFile, List<Integer> annotateIndexes, List<AssociationElement> associationElements) throws PreprocessingException {
private void writeSampleAndAnnotated(File sampleFile, File annotatedFile, List<Integer> annotateIndexes, List<AssociationElement> associationElements, List<AssociationElement> annotableElements) throws PreprocessingException {
try {
AssociationElement associationElement;
Integer annotableIndex;
if (ratings) {
RatingElement ratingElement;
CSVFormat ratingsFormat = CSVFormat.TDF.withHeader("itemId", "userId", "rating");
......@@ -166,8 +221,8 @@ public class PreprocessingRunner implements ApplicationRunner {
for (int i = 0; i < associationElements.size(); i++) {
ratingElement = (RatingElement) associationElements.get(i);
Integer index = new Integer(i);
if (annotateIndexes.contains(index)) {
annotableIndex = new Integer(annotableElements.indexOf(ratingElement));
if (annotableIndex >=0 && annotateIndexes.contains(annotableIndex)) {
annotatedPrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating());
} else {
samplePrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating());
......@@ -183,8 +238,8 @@ public class PreprocessingRunner implements ApplicationRunner {
for (int i = 0; i < associationElements.size(); i++) {
associationElement = associationElements.get(i);
Integer index = new Integer(i);
if (annotateIndexes.contains(index)) {
annotableIndex = new Integer(annotableElements.indexOf(associationElement));
if (annotableIndex >=0 && annotateIndexes.contains(annotableIndex)) {
annotatedPrinter.printRecord(associationElement.getItemId(), associationElement.getUserId());
} else {
samplePrinter.printRecord(associationElement.getItemId(), associationElement.getUserId());
......