@@ -17,6 +17,23 @@ import Foundation
17
17
import ModelSupport
18
18
import TensorFlow
19
19
20
extension Array {
  /// Returns the results of applying `transform` to every element, computing
  /// them on several worker iterations at once while preserving element order.
  ///
  /// The indices are split into interleaved slices: worker `t` handles indices
  /// `t, t + n, t + 2n, ...` where `n` is the effective worker count. Each
  /// result is written back into a shared buffer through a serial queue, so
  /// the buffer itself is never mutated from two threads simultaneously.
  ///
  /// - Parameters:
  ///   - maxThreadCount: Upper bound on the number of concurrent worker
  ///     iterations. Values below 1 are treated as 1, and the effective count
  ///     never exceeds the number of elements. Defaults to 10.
  ///   - transform: Closure applied to each element. It may be invoked from
  ///     several threads at once, so it must be safe to call concurrently.
  /// - Returns: An array with one transformed value per source element, in the
  ///   same order as the source array.
  func concurrentMap<B>(
    maxThreadCount: Int = 10,
    _ transform: @escaping (Element) -> B
  ) -> [B] {
    // Nothing to schedule for an empty array; also keeps the stride below
    // from ever being built with a step of zero.
    guard !isEmpty else { return [] }

    var results = [B?](repeating: nil, count: count)
    // `Swift.` is required: inside an Array extension a bare `min`/`max`
    // resolves to the Sequence member instead of the free function.
    let threadCount = Swift.min(count, Swift.max(maxThreadCount, 1))
    let resultsQueue = DispatchQueue(label: " sync queue ")

    DispatchQueue.concurrentPerform(iterations: threadCount) { threadId in
      for idx in stride(from: threadId, to: count, by: threadCount) {
        // Run the (potentially expensive) transform outside the serial queue
        // so only the cheap store is serialized.
        let transformed = transform(self[idx])
        resultsQueue.sync {
          results[idx] = transformed
        }
      }
    }

    // Every index in 0..<count belongs to exactly one worker's stride, and
    // concurrentPerform returns only after all iterations finish, so no slot
    // can still be nil here.
    return results.map { $0! }
  }
}
36
+
20
37
public enum TextUnsupervisedVariant : String {
21
38
/// - Source: [Einstein AI WikiText-103](
22
39
/// https://blog.einstein.ai/
@@ -174,14 +191,11 @@ public struct TextUnsupervised {
174
191
let path = directory. appendingPathComponent ( " \( variantDetails. filename) / \( name) .csv " )
175
192
let documentsFull = try readCSV ( in: path)
176
193
let documents = Array ( documentsFull [ 0 ..< min ( documentCount, documentsFull. count) ] )
177
- encodedDocs = documents. map { embedding ( for: $0, bpe: bpe) }
178
- } else {
194
+ encodedDocs = documents. concurrentMap { embedding ( for: $0, bpe: bpe) }
195
+ } else {
179
196
let pathPrefix = directory. appendingPathComponent ( " \( variantDetails. encodedFileName!) / \( name) " ) . path
180
- for i in 0 ..< documentCount {
181
- encodedDocs += [
182
- try readEncoded ( in: URL ( fileURLWithPath: " \( pathPrefix) /doc_ \( i) .txt " ) )
183
- ]
184
- }
197
+ encodedDocs = ( 0 ..< documentCount) . map { URL ( fileURLWithPath: " \( pathPrefix) /doc_ \( $0) .txt " ) }
198
+ . concurrentMap { try ! readEncoded ( in: $0) }
185
199
}
186
200
187
201
return LanguageModelDataset (
0 commit comments