Nettoyage des solitaires et non-sélection des solitaires pour les annotations.
Showing 2 changed files with 84 additions and 8 deletions
... | @@ -11,10 +11,7 @@ import org.springframework.boot.ApplicationRunner; | ... | @@ -11,10 +11,7 @@ import org.springframework.boot.ApplicationRunner; |
11 | import org.springframework.stereotype.Component; | 11 | import org.springframework.stereotype.Component; |
12 | 12 | ||
13 | import java.io.*; | 13 | import java.io.*; |
14 | -import java.util.ArrayList; | 14 | +import java.util.*; |
15 | -import java.util.List; | ||
16 | -import java.util.Properties; | ||
17 | -import java.util.Random; | ||
18 | import java.util.stream.Collectors; | 15 | import java.util.stream.Collectors; |
19 | import java.util.stream.StreamSupport; | 16 | import java.util.stream.StreamSupport; |
20 | 17 | ||
... | @@ -60,20 +57,98 @@ public class PreprocessingRunner implements ApplicationRunner { | ... | @@ -60,20 +57,98 @@ public class PreprocessingRunner implements ApplicationRunner { |
60 | loadParameters(); | 57 | loadParameters(); |
61 | setFilenames(); | 58 | setFilenames(); |
62 | List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename)); | 59 | List<AssociationElement> associationElements = loadAssociationElements(new File(dataDir, completeFilename)); |
63 | - List<Integer> annotateIndexes = chooseAnnotated(associationElements.size()); | 60 | + associationElements = cleanupSmallCounts(associationElements, 1, 1); |
61 | + List<Integer> annotateIndexes = chooseAnnotated(associationElements, 1, 1); | ||
64 | writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements); | 62 | writeSampleAndAnnotated(new File(dataDir, sampleFilename), new File(dataDir, annontatedFilename), annotateIndexes, associationElements); |
65 | } | 63 | } |
66 | 64 | ||
67 | - private List<Integer> chooseAnnotated(int size) { | 65 | + private List<AssociationElement> cleanupSmallCounts(List<AssociationElement> associationElements, int userSize, int itemSize) { |
66 | + | ||
67 | + boolean removedUser; | ||
68 | + boolean removedItem; | ||
69 | + long userCount; | ||
70 | + long itemCount; | ||
71 | + Set<Long> itemIdSet; | ||
72 | + Set<Long> userIdSet; | ||
73 | + | ||
74 | + do { | ||
75 | + removedUser = false; | ||
76 | + removedItem = false; | ||
77 | + | ||
78 | + // Books or ratings are more alone than users, so we start with them | ||
79 | + itemIdSet = associationElements.stream().map(element -> element.getItemId()).collect(Collectors.toSet()); | ||
80 | + for (Long itemId : itemIdSet) { | ||
81 | + userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count(); | ||
82 | + if (userCount <= userSize) { | ||
83 | + associationElements = associationElements.stream().filter(element -> element.getItemId() != itemId).collect(Collectors.toList()); | ||
84 | + if (!removedItem) { | ||
85 | + removedItem = true; | ||
86 | + logger.debug("Removed first item"); | ||
87 | + } | ||
88 | + logger.trace("Removed item {}", itemId); | ||
89 | + } | ||
90 | + | ||
91 | + } | ||
92 | + | ||
93 | + logger.debug("Remaining AssociationElement count {}", associationElements.size()); | ||
94 | + | ||
95 | + // Then we remove users | ||
96 | + userIdSet = associationElements.stream().map(element -> element.getUserId()).collect(Collectors.toSet()); | ||
97 | + for (Long userId : userIdSet) { | ||
98 | + itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count(); | ||
99 | + if (itemCount <= itemSize) { | ||
100 | + associationElements = associationElements.stream().filter(element -> element.getUserId() != userId).collect(Collectors.toList()); | ||
101 | + if (!removedUser) { | ||
102 | + removedUser = true; | ||
103 | + logger.debug("Removed first user"); | ||
104 | + } | ||
105 | + logger.trace("Removed user {}", userId); | ||
106 | + } | ||
107 | + } | ||
108 | + | ||
109 | + logger.debug("Remaining AssociationElement count {}", associationElements.size()); | ||
110 | + | ||
111 | + logger.debug("Remover item or user {}", removedUser || removedItem); | ||
112 | + } while (removedUser || removedItem); | ||
113 | + | ||
114 | + return associationElements; | ||
115 | + } | ||
116 | + | ||
117 | + private List<Integer> chooseAnnotated(List<AssociationElement> associationElements, int userSize, int itemSize) { | ||
68 | List<Integer> annotatedChosen = new ArrayList<>(); | 118 | List<Integer> annotatedChosen = new ArrayList<>(); |
119 | + int size = associationElements.size(); | ||
120 | + long userCount = 0; | ||
121 | + long itemCount = 0; | ||
122 | + AssociationElement randomAssociationElement; | ||
123 | + AssociationElement checkingAssociationElement; | ||
69 | 124 | ||
70 | Random random = new Random(); | 125 | Random random = new Random(); |
71 | Integer randomInteger; | 126 | Integer randomInteger; |
72 | - | ||
73 | while (annotatedChosen.size() < size * annotatePercent / 100.0) { | 127 | while (annotatedChosen.size() < size * annotatePercent / 100.0) { |
74 | randomInteger = new Integer(random.nextInt(size)); | 128 | randomInteger = new Integer(random.nextInt(size)); |
129 | + | ||
75 | if (!annotatedChosen.contains(randomInteger)) { | 130 | if (!annotatedChosen.contains(randomInteger)) { |
76 | - annotatedChosen.add(randomInteger); | 131 | + randomAssociationElement = associationElements.get(randomInteger); |
132 | + final Long itemId = randomAssociationElement.getItemId(); | ||
133 | + final Long userId = randomAssociationElement.getUserId(); | ||
134 | + userCount = associationElements.stream().filter(element -> element.getItemId() == itemId).count(); | ||
135 | + itemCount = associationElements.stream().filter(element -> element.getUserId() == userId).count(); | ||
136 | + | ||
137 | + // Decreasing values based on planned suppressions | ||
138 | + // TODO Refactor writeSampleAndAnnotated and chooseAnnotated to avoid this | ||
139 | + for (Integer annotatedIndex : annotatedChosen) { | ||
140 | + checkingAssociationElement = associationElements.get(annotatedIndex); | ||
141 | + if (checkingAssociationElement.getUserId() == userId) { | ||
142 | + userCount--; | ||
143 | + } | ||
144 | + if (checkingAssociationElement.getItemId() == itemId) { | ||
145 | + itemCount--; | ||
146 | + } | ||
147 | + } | ||
148 | + | ||
149 | + if (userCount > userSize && itemCount > itemSize) { | ||
150 | + annotatedChosen.add(randomInteger); | ||
151 | + } | ||
77 | } | 152 | } |
78 | } | 153 | } |
79 | 154 | ... | ... |
... | @@ -11,6 +11,7 @@ | ... | @@ -11,6 +11,7 @@ |
11 | 11 | ||
12 | <logger name="org.legrog" level="DEBUG"/> | 12 | <logger name="org.legrog" level="DEBUG"/> |
13 | <logger name="org.legrog.recommendation.preprocess" level="TRACE"/> | 13 | <logger name="org.legrog.recommendation.preprocess" level="TRACE"/> |
14 | + <logger name="org.legrog.recommendation.preprocess.PreprocessingRunner" level="DEBUG"/> | ||
14 | 15 | ||
15 | <root level="warn"> | 16 | <root level="warn"> |
16 | <appender-ref ref="STDOUT" /> | 17 | <appender-ref ref="STDOUT" /> | ... | ... |
-
Please register or login to post a comment