Jean-Francois Leveque

Nettoyage des solitaires et non-sélection des solitaires pour les annotations.

......@@ -11,10 +11,7 @@ import org.springframework.boot.ApplicationRunner;
import org.springframework.stereotype.Component;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
......@@ -60,22 +57,100 @@ public class PreprocessingRunner implements ApplicationRunner {
loadParameters();
setFilenames();
List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename));
List<Integer> annotateIndexes = chooseAnnotated(associationElements.size());
associationElements = cleanupSmallCounts(associationElements, 1, 1);
List<Integer> annotateIndexes = chooseAnnotated(associationElements, 1, 1);
writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements);
}
private List<Integer> chooseAnnotated(int size) {
/**
 * Repeatedly removes associations belonging to sparsely connected items and users.
 *
 * <p>Each pass first drops every association whose item appears in at most
 * {@code userSize} associations, then drops every association whose user appears in at
 * most {@code itemSize} of the remaining associations. Passes repeat until one completes
 * without removing anything, because removing items can make users sparse and vice versa.
 *
 * <p>The list passed in is never modified; filtering produces new lists. When nothing has
 * to be removed, the list passed in is returned as is.
 *
 * @param associationElements associations to clean up
 * @param userSize an item is removed when it has this many associations or fewer
 * @param itemSize a user is removed when it has this many associations or fewer
 * @return the associations left once no item or user is at or below its threshold
 */
private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) {
    boolean removedUser;
    boolean removedItem;
    do {
        // Books or ratings are more alone than users, so we start with them.
        // Counting once per pass is enough: removing one item's associations never
        // changes how many associations another item has.
        final Map<Long, Long> userCountByItem = new HashMap<>();
        for (AssociationElement element : associationElements) {
            final Long itemId = element.getItemId();
            userCountByItem.merge(itemId, 1L, Long::sum);
        }
        final Set<Long> sparseItemIds = new HashSet<>();
        for (Map.Entry<Long, Long> entry : userCountByItem.entrySet()) {
            if (entry.getValue() <= userSize) {
                if (sparseItemIds.isEmpty()) {
                    logger.debug("Removed first item");
                }
                sparseItemIds.add(entry.getKey());
                logger.trace("Removed item {}", entry.getKey());
            }
        }
        removedItem = !sparseItemIds.isEmpty();
        if (removedItem) {
            // Set membership compares ids by value; == on boxed Long compares references.
            associationElements = associationElements.stream()
                    .filter(element -> !sparseItemIds.contains(element.getItemId()))
                    .collect(Collectors.toList());
        }
        logger.debug("Remaining AssociationElement count {}", associationElements.size());

        // Then we remove users, counted on what is left after the item removal.
        final Map<Long, Long> itemCountByUser = new HashMap<>();
        for (AssociationElement element : associationElements) {
            final Long userId = element.getUserId();
            itemCountByUser.merge(userId, 1L, Long::sum);
        }
        final Set<Long> sparseUserIds = new HashSet<>();
        for (Map.Entry<Long, Long> entry : itemCountByUser.entrySet()) {
            if (entry.getValue() <= itemSize) {
                if (sparseUserIds.isEmpty()) {
                    logger.debug("Removed first user");
                }
                sparseUserIds.add(entry.getKey());
                logger.trace("Removed user {}", entry.getKey());
            }
        }
        removedUser = !sparseUserIds.isEmpty();
        if (removedUser) {
            associationElements = associationElements.stream()
                    .filter(element -> !sparseUserIds.contains(element.getUserId()))
                    .collect(Collectors.toList());
        }
        logger.debug("Remaining AssociationElement count {}", associationElements.size());
        logger.debug("Remover item or user {}", removedUser || removedItem);
    } while (removedUser || removedItem);
    return associationElements;
}
/**
 * Randomly picks the indexes of the associations to set aside as annotated data.
 *
 * <p>Indexes are visited in a uniformly random order. An index is chosen only if, given
 * the associations already chosen, its item still has more than {@code userSize}
 * associations left and its user still has more than {@code itemSize} associations left.
 * Every pick lowers the remaining counts of its item and its user, so an index rejected
 * once can never become eligible later; visiting each index a single time is therefore
 * sufficient and guarantees termination.
 *
 * <p>Selection stops as soon as the number of chosen indexes reaches
 * {@code size * annotatePercent / 100.0}. If too few associations are eligible, fewer
 * indexes are returned and a warning is logged.
 *
 * @param associationElements associations to pick from; not modified
 * @param userSize number of associations an item must still exceed for one of its
 *                 associations to be chosen
 * @param itemSize number of associations a user must still exceed for one of its
 *                 associations to be chosen
 * @return indexes into {@code associationElements} of the chosen associations, in
 *         random order
 */
private List<Integer> chooseAnnotated(List<AssociationElement> associationElements, int userSize, int itemSize) {
    List<Integer> annotatedChosen = new ArrayList<>();
    int size = associationElements.size();
    final double target = size * annotatePercent / 100.0;

    // Associations still available per item and per user, i.e. not yet chosen.
    Map<Long, Long> remainingByItem = new HashMap<>();
    Map<Long, Long> remainingByUser = new HashMap<>();
    for (AssociationElement element : associationElements) {
        final Long itemId = element.getItemId();
        final Long userId = element.getUserId();
        remainingByItem.merge(itemId, 1L, Long::sum);
        remainingByUser.merge(userId, 1L, Long::sum);
    }

    // A random permutation replaces repeated random draws: same selection process,
    // but it cannot spin forever when the target is unreachable.
    List<Integer> candidates = new ArrayList<>(size);
    for (int index = 0; index < size; index++) {
        candidates.add(index);
    }
    Collections.shuffle(candidates, new Random());

    for (Integer candidate : candidates) {
        if (!(annotatedChosen.size() < target)) {
            break;
        }
        AssociationElement candidateElement = associationElements.get(candidate);
        final Long itemId = candidateElement.getItemId();
        final Long userId = candidateElement.getUserId();
        long userCount = remainingByItem.get(itemId);
        long itemCount = remainingByUser.get(userId);
        if (userCount > userSize && itemCount > itemSize) {
            annotatedChosen.add(candidate);
            // Planned suppression: the item loses one association and so does the user.
            remainingByItem.put(itemId, userCount - 1);
            remainingByUser.put(userId, itemCount - 1);
        }
    }

    if (annotatedChosen.size() < target) {
        logger.warn("Only {} of {} associations could be chosen for annotation", annotatedChosen.size(), size);
    }
    return annotatedChosen;
}
......
......@@ -11,6 +11,7 @@
<logger name="org.legrog" level="DEBUG"/>
<logger name="org.legrog.recommendation.preprocess" level="TRACE"/>
<logger name="org.legrog.recommendation.preprocess.PreprocessingRunner" level="DEBUG"/>
<root level="warn">
<appender-ref ref="STDOUT" />
......