Skip to content

Commit 8fdf171

Browse files
committed
Add caption and alt text generation for media
1 parent a45969e commit 8fdf171

File tree

10 files changed

+1485
-5
lines changed

10 files changed

+1485
-5
lines changed

Modules/Package.swift

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,11 @@ let package = Package(
256256
.testTarget(name: "WordPressSharedObjCTests", dependencies: [.target(name: "WordPressShared"), .target(name: "WordPressTesting")], swiftSettings: [.swiftLanguageMode(.v5)]),
257257
.testTarget(name: "WordPressUIUnitTests", dependencies: [.target(name: "WordPressUI")], swiftSettings: [.swiftLanguageMode(.v5)]),
258258
.testTarget(name: "WordPressCoreTests", dependencies: [.target(name: "WordPressCore")]),
259-
.testTarget(name: "WordPressIntelligenceTests", dependencies: [.target(name: "WordPressIntelligence")])
259+
.testTarget(
260+
name: "WordPressIntelligenceTests",
261+
dependencies: [.target(name: "WordPressIntelligence")],
262+
resources: [.process("Resources")]
263+
)
260264
]
261265
)
262266

Modules/Sources/WordPressIntelligence/IntelligenceService.swift

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import Foundation
22
import FoundationModels
33
import NaturalLanguage
4+
import Vision
5+
import UIKit
6+
import WordPressShared
47

58
public enum IntelligenceService {
69
/// Maximum context size for language model sessions (in tokens).
@@ -62,4 +65,135 @@ public enum IntelligenceService {
6265

6366
return languageCode.rawValue
6467
}
68+
69+
/// Analyzes an image with the Vision framework and summarizes what was detected.
///
/// Runs the following Vision requests against the image in a single `perform` call:
/// - Image classification for scene and object identification
/// - Text recognition for readable content
/// - Face detection for portrait photos
/// - Human and animal detection for subjects
/// - Saliency analysis for key regions of interest
/// - Horizon detection for landscape orientation
/// - Barcode detection for QR codes and barcodes
///
/// Each reportable finding becomes one segment of the result; segments are joined with "; ".
/// When nothing reportable is found, the placeholder "Image analyzed" is returned.
///
/// - Note: The requests are performed synchronously; this function contains no suspension point.
///
/// - Parameter cgImage: The image to analyze
/// - Returns: A textual summary of what was detected in the image
/// - Throws: Any error thrown while performing the Vision requests
@available(iOS 26, *)
public static func analyzeImage(_ cgImage: CGImage) async throws -> String {
    let startTime = CFAbsoluteTimeGetCurrent()

    var descriptions: [String] = []

    // Create all analysis requests
    let classifyRequest = VNClassifyImageRequest()
    let textRequest = VNRecognizeTextRequest()
    textRequest.recognitionLevel = .accurate

    let faceRequest = VNDetectFaceRectanglesRequest()
    let humanRequest = VNDetectHumanRectanglesRequest()
    let animalRequest = VNRecognizeAnimalsRequest()
    let saliencyRequest = VNGenerateAttentionBasedSaliencyImageRequest()
    let horizonRequest = VNDetectHorizonRequest()
    let barcodeRequest = VNDetectBarcodesRequest()

    // Perform all requests against the same handler
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    try handler.perform([
        classifyRequest,
        textRequest,
        faceRequest,
        humanRequest,
        animalRequest,
        saliencyRequest,
        horizonRequest,
        barcodeRequest
    ])

    // 1. Scene/Object Classification: of the first five results, keep those above 30% confidence
    if let classifications = classifyRequest.results?.prefix(5) {
        let labels = classifications
            .filter { $0.confidence > 0.3 }
            .map { "\($0.identifier.replacingOccurrences(of: "_", with: " ")) (\(Int($0.confidence * 100))%)" }
        if !labels.isEmpty {
            descriptions.append("Scene: \(labels.joined(separator: ", "))")
        }
    }

    // 2. Subjects - Faces
    // A missing result and an empty result both mean "no faces were reported".
    let faceCount = faceRequest.results?.count ?? 0
    if faceCount > 0 {
        let faceDesc = faceCount == 1 ? "1 face" : "\(faceCount) faces"
        descriptions.append("Subjects: \(faceDesc) detected")
    }

    // 3. Subjects - Humans (full body)
    if let humanObservations = humanRequest.results, !humanObservations.isEmpty {
        let humanCount = humanObservations.count
        let humanDesc = humanCount == 1 ? "1 person" : "\(humanCount) people"

        if faceCount == 0 {
            // No faces were mentioned, so people are the primary subjects rather than "additional" ones.
            descriptions.append("Subjects: \(humanDesc) detected")
        } else if humanCount > faceCount {
            // Faces were already reported; only add people when there are more of them than faces.
            descriptions.append("Additional subjects: \(humanDesc) visible")
        }
    }

    // 4. Animals: keep observations above 50% confidence and report their first label
    if let animalObservations = animalRequest.results, !animalObservations.isEmpty {
        let animals = animalObservations
            .filter { $0.confidence > 0.5 }
            .compactMap { observation -> String? in
                guard let label = observation.labels.first else { return nil }
                return "\(label.identifier) (\(Int(label.confidence * 100))%)"
            }
        if !animals.isEmpty {
            descriptions.append("Animals: \(animals.joined(separator: ", "))")
        }
    }

    // 5. Saliency (regions of interest), taken from the first saliency observation
    if let salientObjects = saliencyRequest.results?.first?.salientObjects, !salientObjects.isEmpty {
        descriptions.append("Key regions: \(salientObjects.count) area\(salientObjects.count == 1 ? "" : "s") of interest")
    }

    // 6. Horizon detection (indicates landscape/orientation)
    if let horizon = horizonRequest.results?.first {
        let angle = horizon.angle * 180 / .pi // radians -> degrees
        if abs(angle) > 5 { // Only mention if horizon is noticeably tilted
            descriptions.append("Composition: horizon at \(Int(angle))° angle")
        }
    }

    // 7. Text content: best candidate per observation, truncated to 100 characters
    if let textObservations = textRequest.results, !textObservations.isEmpty {
        let text = textObservations
            .compactMap { $0.topCandidates(1).first?.string }
            .joined(separator: " ")
        if !text.isEmpty {
            let truncatedText = String(text.prefix(100))
            descriptions.append("Text: \"\(truncatedText)\(text.count > 100 ? "..." : "")\"")
        }
    }

    // 8. Barcodes/QR codes: report the symbology of each detected code
    if let barcodeObservations = barcodeRequest.results, !barcodeObservations.isEmpty {
        let barcodeTypes = barcodeObservations.map(\.symbology.rawValue)
        descriptions.append("Codes: \(barcodeTypes.joined(separator: ", "))")
    }

    let description = descriptions.isEmpty
        ? "Image analyzed"
        : descriptions.joined(separator: "; ")

    WPLogInfo("IntelligenceService.analyzeImage executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")

    return description
}
65199
}
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import Foundation
2+
3+
/// Metadata for generating alt text and captions for media items.
///
/// Every field is optional and defaults to `nil`, so callers supply only what they know.
/// The type is an immutable value made only of optional strings, so it is declared `Sendable`
/// and can be handed to the async generation APIs across concurrency domains.
public struct MediaMetadata: Sendable {
    public let filename: String?
    public let title: String?
    public let caption: String?
    public let description: String?
    public let altText: String?
    public let fileType: String?
    public let dimensions: String?
    /// Textual result of a visual analysis of the image, if one was performed.
    public let imageAnalysis: String?

    /// Creates metadata from whichever fields are available; omitted fields are `nil`.
    public init(
        filename: String? = nil,
        title: String? = nil,
        caption: String? = nil,
        description: String? = nil,
        altText: String? = nil,
        fileType: String? = nil,
        dimensions: String? = nil,
        imageAnalysis: String? = nil
    ) {
        self.filename = filename
        self.title = title
        self.caption = caption
        self.description = description
        self.altText = altText
        self.fileType = fileType
        self.dimensions = dimensions
        self.imageAnalysis = imageAnalysis
    }

    /// Whether at least one field holds a non-empty string.
    ///
    /// `nil` and `""` both count as "no content"; whitespace-only strings count as content.
    var hasContent: Bool {
        let fields = [filename, title, caption, description, altText, fileType, dimensions, imageAnalysis]
        return fields.contains { field in
            guard let field else { return false }
            return !field.isEmpty
        }
    }
}
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
import Foundation
2+
import FoundationModels
3+
import UIKit
4+
import WordPressShared
5+
6+
/// Alt text generation for media items.
///
/// Generates concise, descriptive, and accessible alt text for images based on
/// visual analysis and available metadata.
///
/// Example usage:
/// ```swift
/// let generator = ImageAltTextGenerator()
/// let altText = try await generator.generate(metadata: metadata)
/// ```
@available(iOS 26, *)
public struct ImageAltTextGenerator {
    /// Generation options passed to the language model session for every request.
    public var options: GenerationOptions

    /// Creates a generator.
    ///
    /// - Parameter options: Generation options; defaults to a temperature of 0.7
    public init(options: GenerationOptions = GenerationOptions(temperature: 0.7)) {
        self.options = options
    }

    /// Generates alt text for a media item.
    ///
    /// - Parameter metadata: The media metadata to use for generation
    /// - Returns: Generated alt text, trimmed of surrounding whitespace and newlines
    /// - Throws: An `NSError` (code -1) if no metadata field used by the prompt has content,
    ///   or any error thrown by the language model session
    public func generate(metadata: MediaMetadata) async throws -> String {
        // Validate against what the prompt will actually contain. `MediaMetadata.hasContent` also
        // counts fields the prompt never includes (such as `altText`), which would otherwise let a
        // request through with an empty context.
        guard !Self.contextParts(for: metadata).isEmpty else {
            throw Self.makeError(
                code: -1,
                message: "Insufficient metadata to generate alt text. Please add a filename, title, or description first."
            )
        }

        let startTime = CFAbsoluteTimeGetCurrent()
        let session = makeSession()
        let prompt = makePrompt(metadata: metadata)

        let response = try await session.respond(to: prompt, options: options)

        WPLogInfo("ImageAltTextGenerator executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")

        return response.content.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Generates alt text for an image with automatic Vision analysis.
    ///
    /// Analyzes the image via `IntelligenceService.analyzeImage(_:)` and generates alt text from
    /// that analysis combined with the provided metadata.
    ///
    /// - Parameters:
    ///   - cgImage: The image to analyze and generate alt text for
    ///   - metadata: Additional metadata (filename, title, etc.). Its `imageAnalysis` value is
    ///     replaced by the result of the analysis.
    /// - Returns: Generated alt text
    /// - Throws: If image analysis or generation fails
    public func generate(cgImage: CGImage, metadata: MediaMetadata = MediaMetadata()) async throws -> String {
        let imageAnalysis = try await IntelligenceService.analyzeImage(cgImage)

        // MediaMetadata is immutable, so rebuild it with the analysis filled in.
        let metadataWithAnalysis = MediaMetadata(
            filename: metadata.filename,
            title: metadata.title,
            caption: metadata.caption,
            description: metadata.description,
            altText: metadata.altText,
            fileType: metadata.fileType,
            dimensions: metadata.dimensions,
            imageAnalysis: imageAnalysis
        )

        return try await generate(metadata: metadataWithAnalysis)
    }

    /// Generates alt text for a `UIImage` with automatic Vision analysis.
    ///
    /// - Parameters:
    ///   - image: The UIImage to analyze and generate alt text for
    ///   - metadata: Additional metadata (filename, title, etc.). Its `imageAnalysis` value is
    ///     replaced by the result of the analysis.
    /// - Returns: Generated alt text
    /// - Throws: An `NSError` (code -2) if the image has no `cgImage`, or if analysis/generation fails
    public func generate(image: UIImage, metadata: MediaMetadata = MediaMetadata()) async throws -> String {
        // NOTE(review): `cgImage` does not carry `image.imageOrientation`; confirm whether rotated
        // images need to be normalized before analysis.
        guard let cgImage = image.cgImage else {
            throw Self.makeError(code: -2, message: "Unable to convert UIImage to CGImage")
        }
        return try await generate(cgImage: cgImage, metadata: metadata)
    }

    /// Generates alt text for encoded image data with automatic Vision analysis.
    ///
    /// - Parameters:
    ///   - imageData: The image data to analyze and generate alt text for
    ///   - metadata: Additional metadata (filename, title, etc.). Its `imageAnalysis` value is
    ///     replaced by the result of the analysis.
    /// - Returns: Generated alt text
    /// - Throws: An `NSError` (code -3) if the data cannot be decoded into an image, or if
    ///   analysis/generation fails
    public func generate(imageData: Data, metadata: MediaMetadata = MediaMetadata()) async throws -> String {
        guard let image = UIImage(data: imageData) else {
            throw Self.makeError(code: -3, message: "Unable to create UIImage from data")
        }
        return try await generate(image: image, metadata: metadata)
    }

    // MARK: - Session & Prompt Building

    /// Creates a language model session configured for alt text generation.
    ///
    /// - Returns: Configured session with instructions
    public func makeSession() -> LanguageModelSession {
        LanguageModelSession(
            model: .init(guardrails: .permissiveContentTransformations),
            instructions: Self.instructions
        )
    }

    /// Instructions for the language model on how to generate alt text.
    public static var instructions: String {
        """
        You are helping a WordPress user generate alt text for an image.
        Alt text should be concise, descriptive, and accessible for screen readers.

        **Parameters**
        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
        - FILENAME: the image filename
        - FILE_TYPE: the file type/extension
        - DIMENSIONS: the image dimensions
        - TITLE: the image title (if available)
        - CAPTION: the image caption (if available)
        - DESCRIPTION: the image description (if available)

        **Requirements**
        - Generate concise alt text (1-2 sentences, max 125 characters)
        - Prioritize IMAGE_ANALYSIS when describing what's in the image
        - Focus on what the image depicts, not decorative elements
        - Use simple, clear language
        - Do not include phrases like "image of" or "picture of"
        - Only output the alt text, nothing else
        """
    }

    /// Builds the prompt for generating alt text.
    ///
    /// Each non-empty metadata field used by the prompt becomes one `LABEL: 'value'` line, in the
    /// same order as the parameters listed in `instructions`.
    ///
    /// - Parameter metadata: The media metadata
    /// - Returns: Formatted prompt string ready for the language model
    public func makePrompt(metadata: MediaMetadata) -> String {
        let context = Self.contextParts(for: metadata).joined(separator: "\n")

        return """
            Generate alt text for an image with the following information:

            \(context)
            """
    }

    // MARK: - Private Helpers

    /// Returns one `LABEL: 'value'` line for every prompt field that is present and non-empty.
    private static func contextParts(for metadata: MediaMetadata) -> [String] {
        let fields: [(label: String, value: String?)] = [
            ("IMAGE_ANALYSIS", metadata.imageAnalysis),
            ("FILENAME", metadata.filename),
            ("FILE_TYPE", metadata.fileType),
            ("DIMENSIONS", metadata.dimensions),
            ("TITLE", metadata.title),
            ("CAPTION", metadata.caption),
            ("DESCRIPTION", metadata.description)
        ]

        return fields.compactMap { field in
            guard let value = field.value, !value.isEmpty else { return nil }
            return "\(field.label): '\(value)'"
        }
    }

    /// Builds the `NSError` thrown by the `generate` methods, using the shared error domain.
    private static func makeError(code: Int, message: String) -> NSError {
        NSError(domain: "IntelligenceService", code: code, userInfo: [
            NSLocalizedDescriptionKey: message
        ])
    }
}
185+
186+
@available(iOS 26, *)
187+
extension IntelligenceService {
188+
/// Generates alt text for a media item based on available metadata.
189+
///
190+
/// - Parameter metadata: The media metadata to use for generation
191+
/// - Returns: Generated alt text
192+
/// - Throws: If metadata is insufficient or generation fails
193+
public func generateAltText(metadata: MediaMetadata) async throws -> String {
194+
try await ImageAltTextGenerator().generate(metadata: metadata)
195+
}
196+
}

0 commit comments

Comments
 (0)