Jean-Francois Leveque

Nettoyage des solitaires et non-sélection des solitaires pour les annotations.

...@@ -11,10 +11,7 @@ import org.springframework.boot.ApplicationRunner; ...@@ -11,10 +11,7 @@ import org.springframework.boot.ApplicationRunner;
11 import org.springframework.stereotype.Component; 11 import org.springframework.stereotype.Component;
12 12
13 import java.io.*; 13 import java.io.*;
14 -import java.util.ArrayList; 14 +import java.util.*;
15 -import java.util.List;
16 -import java.util.Properties;
17 -import java.util.Random;
18 import java.util.stream.Collectors; 15 import java.util.stream.Collectors;
19 import java.util.stream.StreamSupport; 16 import java.util.stream.StreamSupport;
20 17
...@@ -60,20 +57,98 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -60,20 +57,98 @@ public class PreprocessingRunner implements ApplicationRunner {
60 loadParameters(); 57 loadParameters();
61 setFilenames(); 58 setFilenames();
62 List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename)); 59 List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename));
63 - List<Integer> annotateIndexes = chooseAnnotated(associationElements.size()); 60 + associationElements = cleanupSmallCounts(associationElements, 1, 1);
61 + List<Integer> annotateIndexes = chooseAnnotated(associationElements, 1, 1);
64 writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements); 62 writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements);
65 } 63 }
66 64
67 - private List<Integer> chooseAnnotated(int size) { 65 + private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) {
66 +
67 + boolean removedUser;
68 + boolean removedItem;
69 + long userCount;
70 + long itemCount;
71 + Set<Long> itemIdSet;
72 + Set<Long> userIdSet;
73 +
74 + do {
75 + removedUser = false;
76 + removedItem = false;
77 +
78 + // Books or ratings are more alone than users, so we start with them
79 + itemIdSet = associationElements.stream().map(element -> element.getItemId()).collect(Collectors.toSet());
80 + for (Long itemId : itemIdSet) {
81 + userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count();
82 + if (userCount <= userSize) {
83 + associationElements = associationElements.stream().filter(element -> element.getItemId() != itemId).collect(Collectors.toList());
84 + if (!removedItem) {
85 + removedItem = true;
86 + logger.debug("Removed first item");
87 + }
88 + logger.trace("Removed item {}", itemId);
89 + }
90 +
91 + }
92 +
93 + logger.debug("Remaining AssociationElement count {}", associationElements.size());
94 +
95 + // Then we remove users
96 + userIdSet = associationElements.stream().map(element -> element.getUserId()).collect(Collectors.toSet());
97 + for (Long userId : userIdSet) {
98 + itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count();
99 + if (itemCount <= itemSize) {
100 + associationElements = associationElements.stream().filter(element -> element.getUserId() != userId).collect(Collectors.toList());
101 + if (!removedUser) {
102 + removedUser = true;
103 + logger.debug("Removed first user");
104 + }
105 + logger.trace("Removed user {}", userId);
106 + }
107 + }
108 +
109 + logger.debug("Remaining AssociationElement count {}", associationElements.size());
110 +
111 + logger.debug("Remover item or user {}", removedUser || removedItem);
112 + } while (removedUser || removedItem);
113 +
114 + return associationElements;
115 + }
116 +
117 + private List<Integer> chooseAnnotated(List<AssociationElement> associationElements, int userSize, int itemSize) {
68 List<Integer> annotatedChosen = new ArrayList<>(); 118 List<Integer> annotatedChosen = new ArrayList<>();
119 + int size = associationElements.size();
120 + long userCount = 0;
121 + long itemCount = 0;
122 + AssociationElement randomAssociationElement;
123 + AssociationElement checkingAssociationElement;
69 124
70 Random random = new Random(); 125 Random random = new Random();
71 Integer randomInteger; 126 Integer randomInteger;
72 -
73 while (annotatedChosen.size() < size * annotatePercent / 100.0) { 127 while (annotatedChosen.size() < size * annotatePercent / 100.0) {
74 randomInteger = new Integer(random.nextInt(size)); 128 randomInteger = new Integer(random.nextInt(size));
129 +
75 if (!annotatedChosen.contains(randomInteger)) { 130 if (!annotatedChosen.contains(randomInteger)) {
76 - annotatedChosen.add(randomInteger); 131 + randomAssociationElement = associationElements.get(randomInteger);
132 + final Long itemId = randomAssociationElement.getItemId();
133 + final Long userId = randomAssociationElement.getUserId();
134 + userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count();
135 + itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count();
136 +
137 + // Decreasing values based on planned suppressions
138 + // TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this
139 + for (Integer annotatedIndex : annotatedChosen) {
140 + checkingAssociationElement = associationElements.get(annotatedIndex);
141 + if (checkingAssociationElement.getUserId() == userId) {
142 + userCount--;
143 + }
144 + if (checkingAssociationElement.getItemId() == itemId) {
145 + itemCount--;
146 + }
147 + }
148 +
149 + if (userCount > userSize && itemCount > itemSize) {
150 + annotatedChosen.add(randomInteger);
151 + }
77 } 152 }
78 } 153 }
79 154
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
11 11
12 <logger name="org.legrog" level="DEBUG"/> 12 <logger name="org.legrog" level="DEBUG"/>
13 <logger name="org.legrog.recommendation.preprocess" level="TRACE"/> 13 <logger name="org.legrog.recommendation.preprocess" level="TRACE"/>
14 + <logger name="org.legrog.recommendation.preprocess.PreprocessingRunner" level="DEBUG"/>
14 15
15 <root level="warn"> 16 <root level="warn">
16 <appender-ref ref="STDOUT" /> 17 <appender-ref ref="STDOUT" />
......