Jean-Francois Leveque

Retrait des solitaires des annotables en une seule passe : item puis user

...@@ -58,10 +58,12 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -58,10 +58,12 @@ public class PreprocessingRunner implements ApplicationRunner {
58 setFilenames(); 58 setFilenames();
59 List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename)); 59 List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename));
60 // associationElements = cleanupSmallCounts(associationElements, 1, 1); 60 // associationElements = cleanupSmallCounts(associationElements, 1, 1);
61 - List<Integer> annotateIndexes = chooseAnnotated(associationElements, 1, 1); 61 + List<AssociationElement> annotableElements = removeFirstSmallCounts(associationElements, 1, 1);
62 - writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements); 62 + List<Integer> annotateIndexes = chooseAnnotated(associationElements, annotableElements, 1, 1);
63 + writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements, annotableElements);
63 } 64 }
64 65
66 + // TODO retirer duplication de code entre cleanupSmallCounts et removeFirstSmallCounts
65 private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) { 67 private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) {
66 68
67 boolean removedUser; 69 boolean removedUser;
...@@ -114,9 +116,59 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -114,9 +116,59 @@ public class PreprocessingRunner implements ApplicationRunner {
114 return associationElements; 116 return associationElements;
115 } 117 }
116 118
117 - private List<Integer> chooseAnnotated(List<AssociationElement> associationElements, int userSize, int itemSize) { 119 + private List<AssociationElement> removeFirstSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) {
120 +
121 + boolean removedUser;
122 + boolean removedItem;
123 + long userCount;
124 + long itemCount;
125 + Set<Long> itemIdSet;
126 + Set<Long> userIdSet;
127 +
128 + removedUser = false;
129 + removedItem = false;
130 +
131 + // Books or ratings are more alone than users, so we start with them
132 + itemIdSet = associationElements.stream().map(element -> element.getItemId()).collect(Collectors.toSet());
133 + for (Long itemId : itemIdSet) {
134 + userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count();
135 + if (userCount <= userSize) {
136 + associationElements = associationElements.stream().filter(element -> element.getItemId() != itemId).collect(Collectors.toList());
137 + if (!removedItem) {
138 + removedItem = true;
139 + logger.debug("Removed first item");
140 + }
141 + logger.trace("Removed item {}", itemId);
142 + }
143 +
144 + }
145 +
146 + logger.debug("Remaining AssociationElement count {}", associationElements.size());
147 +
148 + // Then we remove users
149 + userIdSet = associationElements.stream().map(element -> element.getUserId()).collect(Collectors.toSet());
150 + for (Long userId : userIdSet) {
151 + itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count();
152 + if (itemCount <= itemSize) {
153 + associationElements = associationElements.stream().filter(element -> element.getUserId() != userId).collect(Collectors.toList());
154 + if (!removedUser) {
155 + removedUser = true;
156 + logger.debug("Removed first user");
157 + }
158 + logger.trace("Removed user {}", userId);
159 + }
160 + }
161 +
162 + logger.debug("Remaining AssociationElement count {}", associationElements.size());
163 +
164 + logger.debug("Remover item or user {}", removedUser || removedItem);
165 +
166 + return associationElements;
167 + }
168 +
169 + private List<Integer> chooseAnnotated(List<AssociationElement> annotableElements, List<AssociationElement> associationElements, int userSize, int itemSize) {
118 List<Integer> annotatedChosen = new ArrayList<>(); 170 List<Integer> annotatedChosen = new ArrayList<>();
119 - int size = associationElements.size(); 171 + int size = annotableElements.size();
120 long userCount = 0; 172 long userCount = 0;
121 long itemCount = 0; 173 long itemCount = 0;
122 AssociationElement randomAssociationElement; 174 AssociationElement randomAssociationElement;
...@@ -128,16 +180,17 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -128,16 +180,17 @@ public class PreprocessingRunner implements ApplicationRunner {
128 randomInteger = new Integer(random.nextInt(size)); 180 randomInteger = new Integer(random.nextInt(size));
129 181
130 if (!annotatedChosen.contains(randomInteger)) { 182 if (!annotatedChosen.contains(randomInteger)) {
131 - randomAssociationElement = associationElements.get(randomInteger); 183 + randomAssociationElement = annotableElements.get(randomInteger);
132 final Long itemId = randomAssociationElement.getItemId(); 184 final Long itemId = randomAssociationElement.getItemId();
133 final Long userId = randomAssociationElement.getUserId(); 185 final Long userId = randomAssociationElement.getUserId();
134 userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count(); 186 userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count();
135 itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count(); 187 itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count();
188 + logger.trace("Checking new AssociationElement for annotation");
136 189
137 // Decreasing values based on planned suppressions 190 // Decreasing values based on planned suppressions
138 // TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this 191 // TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this
139 for (Integer annotatedIndex : annotatedChosen) { 192 for (Integer annotatedIndex : annotatedChosen) {
140 - checkingAssociationElement = associationElements.get(annotatedIndex); 193 + checkingAssociationElement = annotableElements.get(annotatedIndex);
141 if (checkingAssociationElement.getUserId() == userId) { 194 if (checkingAssociationElement.getUserId() == userId) {
142 userCount--; 195 userCount--;
143 } 196 }
...@@ -148,6 +201,7 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -148,6 +201,7 @@ public class PreprocessingRunner implements ApplicationRunner {
148 201
149 if (userCount > userSize && itemCount > itemSize) { 202 if (userCount > userSize && itemCount > itemSize) {
150 annotatedChosen.add(randomInteger); 203 annotatedChosen.add(randomInteger);
204 + logger.debug("Adding new AssociationElement to annotated, total is {}", annotatedChosen.size());
151 } 205 }
152 } 206 }
153 } 207 }
...@@ -155,9 +209,10 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -155,9 +209,10 @@ public class PreprocessingRunner implements ApplicationRunner {
155 return annotatedChosen; 209 return annotatedChosen;
156 } 210 }
157 211
158 - private void writeSampleAndAnnotated(File sampleFile, File annotatedFile, List<Integer> annotateIndexes, List<AssociationElement> associationElements) throws PreprocessingException { 212 + private void writeSampleAndAnnotated(File sampleFile, File annotatedFile, List<Integer> annotateIndexes, List<AssociationElement> associationElements, List<AssociationElement> annotableElements) throws PreprocessingException {
159 try { 213 try {
160 AssociationElement associationElement; 214 AssociationElement associationElement;
215 + Integer annotableIndex;
161 if (ratings) { 216 if (ratings) {
162 RatingElement ratingElement; 217 RatingElement ratingElement;
163 CSVFormat ratingsFormat = CSVFormat.TDF.withHeader("itemId", "userId", "rating"); 218 CSVFormat ratingsFormat = CSVFormat.TDF.withHeader("itemId", "userId", "rating");
...@@ -166,8 +221,8 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -166,8 +221,8 @@ public class PreprocessingRunner implements ApplicationRunner {
166 221
167 for (int i = 0; i < associationElements.size(); i++) { 222 for (int i = 0; i < associationElements.size(); i++) {
168 ratingElement = (RatingElement) associationElements.get(i); 223 ratingElement = (RatingElement) associationElements.get(i);
169 - Integer index = new Integer(i); 224 + annotableIndex = new Integer(annotableElements.indexOf(ratingElement));
170 - if (annotateIndexes.contains(index)) { 225 + if (annotableIndex >=0 && annotateIndexes.contains(annotableIndex)) {
171 annotatedPrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating()); 226 annotatedPrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating());
172 } else { 227 } else {
173 samplePrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating()); 228 samplePrinter.printRecord(ratingElement.getItemId(), ratingElement.getUserId(), ratingElement.getRating());
...@@ -183,8 +238,8 @@ public class PreprocessingRunner implements ApplicationRunner { ...@@ -183,8 +238,8 @@ public class PreprocessingRunner implements ApplicationRunner {
183 238
184 for (int i = 0; i < associationElements.size(); i++) { 239 for (int i = 0; i < associationElements.size(); i++) {
185 associationElement = associationElements.get(i); 240 associationElement = associationElements.get(i);
186 - Integer index = new Integer(i); 241 + annotableIndex = new Integer(annotableElements.indexOf(associationElement));
187 - if (annotateIndexes.contains(index)) { 242 + if (annotableIndex >=0 && annotateIndexes.contains(annotableIndex)) {
188 annotatedPrinter.printRecord(associationElement.getItemId(), associationElement.getUserId()); 243 annotatedPrinter.printRecord(associationElement.getItemId(), associationElement.getUserId());
189 } else { 244 } else {
190 samplePrinter.printRecord(associationElement.getItemId(), associationElement.getUserId()); 245 samplePrinter.printRecord(associationElement.getItemId(), associationElement.getUserId());
......