Skip to content

Vectorize UTF16->UTF8 transcoding #83073

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 69 additions & 15 deletions stdlib/public/core/StringCreate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -247,26 +247,57 @@ extension String {
initializingFrom: input, isASCII: isASCII)
return storage.asString
}

/// Creates a native UTF-8 `String` from a buffer of UTF-16 code units.
///
/// The input is measured first with `utf8Length(of:repairing:)`; when that
/// yields `nil`, this function returns `nil` too. Otherwise the code units
/// are transcoded directly into their final home: a `_SmallString` when the
/// measured UTF-8 length fits in `_SmallString.capacity`, or newly created
/// `__StringStorage` when it does not.
///
/// - Parameters:
///   - input: The UTF-16 code units to transcode.
///   - repairing: Forwarded unchanged to both the measuring and the
///     transcoding helper.
/// - Returns: The new string together with the `repairsMade` flag reported
///   by the transcoder, or `nil` if the UTF-8 length could not be computed.
///   Empty input yields `("", repairsMade: false)`.
internal static func _fromUTF16(
  _ input: UnsafeBufferPointer<UInt16>,
  repairing: Bool = true
) -> (String, repairsMade: Bool)? {
  guard !input.isEmpty else {
    return ("", repairsMade: false)
  }
  guard let (byteCount, allASCII) = unsafe utf8Length(
    of: input,
    repairing: repairing
  ) else {
    return nil
  }

  // The initializer closures below can only return the written count, so the
  // transcoder's repair flag is carried out through this captured variable.
  var didRepair = false

  // Small-string path: the measured length fits in the inline representation.
  guard byteCount > _SmallString.capacity else {
    let small = unsafe _SmallString(initializingUTF8With: { destination in
      let (written, repaired) = unsafe transcodeUTF16ToUTF8(
        UTF16CodeUnits: input,
        into: destination,
        repairing: repairing
      )
      didRepair = repaired
      return written
    })
    return (String(_StringGuts(small)), repairsMade: didRepair)
  }

  // Large-string path: transcode straight into storage sized by the
  // measurement above.
  let storage = unsafe __StringStorage.create(
    uninitializedCodeUnitCapacity: byteCount,
    initializingUncheckedUTF8With: { destination -> Int in
      let (written, repaired) = unsafe transcodeUTF16ToUTF8(
        UTF16CodeUnits: input,
        into: destination,
        repairing: repairing
      )
      didRepair = repaired
      return written
    }
  )
  // The count is passed back unchanged; this call hands the ASCII flag from
  // the measuring pass to the storage.
  storage._updateCountAndFlags(
    newCount: storage.count,
    newIsASCII: allASCII
  )
  return (storage.asString, repairsMade: didRepair)
}

/// Creates a `String` from UTF-16 code units that the caller asserts are
/// already well-formed.
///
/// The work is delegated to `_fromUTF16(_:repairing:)`, which transcodes
/// directly into the string's final storage instead of going through an
/// intermediate `[UInt8]`.
///
/// - Parameter input: The UTF-16 code units to transcode.
/// - Returns: The transcoded string.
@usableFromInline
internal static func _uncheckedFromUTF16(
  _ input: UnsafeBufferPointer<UInt16>
) -> String {
  // NOTE(review): the force-unwrap relies on `_fromUTF16` never returning
  // `nil` when `repairing` is `true` — confirm against `utf8Length`.
  let (result, repaired) = unsafe _fromUTF16(input, repairing: true)!
  // "Unchecked" means the caller vouches for validity, so a reported repair
  // indicates a violated precondition.
  _internalInvariant(!repaired, "Error present")
  return result
}

@inline(never) // slow path
Expand Down Expand Up @@ -311,7 +342,30 @@ extension String {
repair: Bool
) -> (String, repairsMade: Bool)?
where Input.Element == Encoding.CodeUnit {
guard _fastPath(encoding == Unicode.ASCII.self) else {
if encoding != Unicode.ASCII.self {
if encoding == Unicode.UTF16.self {
if let str = input.withContiguousStorageIfAvailable({ buffer in
unsafe _fromUTF16(
UnsafeRawBufferPointer(buffer).assumingMemoryBound(to: UInt16.self),
repairing: repair
)
}) {
return str
}
#if !$Embedded
if let contigBytes = input as? _HasContiguousBytes,
contigBytes._providesContiguousBytesNoCopy {
if let str = contigBytes.withUnsafeBytes({ buffer in
unsafe _fromUTF16(
buffer.assumingMemoryBound(to: UInt16.self),
repairing: repair
)
}) {
return str
}
}
#endif
}
return _slowFromCodeUnits(input, encoding: encoding, repair: repair)
}

Expand Down
Loading