
Commit cfbd47d

Add an app for testing alt text
1 parent 8fdf171 commit cfbd47d

6 files changed (+496, -61 lines)

Modules/Sources/WordPressIntelligence/IntelligenceService.swift

Lines changed: 162 additions & 41 deletions
```diff
@@ -71,129 +71,250 @@ public enum IntelligenceService {
     /// Uses multiple Vision APIs to gather detailed information about the image:
     /// - Image classification for scene and object identification
     /// - Text recognition for readable content
-    /// - Face detection for portrait photos
+    /// - Face detection and landmarks for portraits
     /// - Human and animal detection for subjects
     /// - Saliency analysis for key regions of interest
     /// - Horizon detection for landscape orientation
     /// - Barcode detection for QR codes and barcodes
+    /// - Document detection for papers and screenshots
     ///
     /// - Parameter cgImage: The image to analyze
-    /// - Returns: A comprehensive description of what's in the image
+    /// - Returns: A JSON string with structured analysis data
    /// - Throws: If image analysis fails
     @available(iOS 26, *)
     public static func analyzeImage(_ cgImage: CGImage) async throws -> String {
         let startTime = CFAbsoluteTimeGetCurrent()

-        var descriptions: [String] = []
-
         // Create all analysis requests
         let classifyRequest = VNClassifyImageRequest()
         let textRequest = VNRecognizeTextRequest()
         textRequest.recognitionLevel = .accurate
+        textRequest.usesLanguageCorrection = true

         let faceRequest = VNDetectFaceRectanglesRequest()
+        let faceLandmarksRequest = VNDetectFaceLandmarksRequest()
         let humanRequest = VNDetectHumanRectanglesRequest()
         let animalRequest = VNRecognizeAnimalsRequest()
         let saliencyRequest = VNGenerateAttentionBasedSaliencyImageRequest()
         let horizonRequest = VNDetectHorizonRequest()
         let barcodeRequest = VNDetectBarcodesRequest()
+        let documentRequest = VNDetectDocumentSegmentationRequest()

         // Perform all requests
         let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
         try handler.perform([
             classifyRequest,
             textRequest,
             faceRequest,
+            faceLandmarksRequest,
             humanRequest,
             animalRequest,
             saliencyRequest,
             horizonRequest,
-            barcodeRequest
+            barcodeRequest,
+            documentRequest
         ])

+        // Build structured analysis result
+        var analysis: [String: Any] = [:]
+
+        // Image dimensions
+        analysis["imageSize"] = [
+            "width": cgImage.width,
+            "height": cgImage.height
+        ]
+
+        let aspectRatio = Double(cgImage.width) / Double(cgImage.height)
+        if aspectRatio > 1.5 {
+            analysis["orientation"] = "landscape"
+        } else if aspectRatio < 0.7 {
+            analysis["orientation"] = "portrait"
+        } else {
+            analysis["orientation"] = "square"
+        }
+
         // 1. Scene/Object Classification
         if let classifications = classifyRequest.results?.prefix(5) {
             let labels = classifications
                 .filter { $0.confidence > 0.3 }
-                .map { "\($0.identifier.replacingOccurrences(of: "_", with: " ")) (\(Int($0.confidence * 100))%)" }
+                .map { [
+                    "label": $0.identifier.replacingOccurrences(of: "_", with: " "),
+                    "confidence": Int($0.confidence * 100)
+                ] as [String: Any] }
             if !labels.isEmpty {
-                descriptions.append("Scene: \(labels.joined(separator: ", "))")
+                analysis["sceneClassification"] = labels
             }
         }

-        // 2. Subjects - Faces
-        if let faceObservations = faceRequest.results, !faceObservations.isEmpty {
-            let faceCount = faceObservations.count
-            let faceDesc = faceCount == 1 ? "1 face" : "\(faceCount) faces"
-            descriptions.append("Subjects: \(faceDesc) detected")
+        // 2. Face Detection with Landmarks
+        var facesData: [[String: Any]] = []
+        if let faceObservations = faceLandmarksRequest.results, !faceObservations.isEmpty {
+            for face in faceObservations {
+                var faceInfo: [String: Any] = [:]
+
+                // Position
+                let bounds = face.boundingBox
+                if bounds.origin.x < 0.33 {
+                    faceInfo["horizontalPosition"] = "left"
+                } else if bounds.origin.x > 0.66 {
+                    faceInfo["horizontalPosition"] = "right"
+                } else {
+                    faceInfo["horizontalPosition"] = "center"
+                }
+
+                if bounds.origin.y < 0.33 {
+                    faceInfo["verticalPosition"] = "bottom"
+                } else if bounds.origin.y > 0.66 {
+                    faceInfo["verticalPosition"] = "top"
+                } else {
+                    faceInfo["verticalPosition"] = "middle"
+                }
+
+                // Size (relative to image)
+                let faceArea = bounds.width * bounds.height
+                if faceArea > 0.25 {
+                    faceInfo["size"] = "closeup"
+                } else if faceArea > 0.1 {
+                    faceInfo["size"] = "medium"
+                } else {
+                    faceInfo["size"] = "distant"
+                }
+
+                // Landmarks details
+                if let landmarks = face.landmarks {
+                    var landmarksInfo: [String] = []
+                    if landmarks.faceContour != nil { landmarksInfo.append("face contour") }
+                    if landmarks.leftEye != nil { landmarksInfo.append("left eye") }
+                    if landmarks.rightEye != nil { landmarksInfo.append("right eye") }
+                    if landmarks.nose != nil { landmarksInfo.append("nose") }
+                    if landmarks.outerLips != nil { landmarksInfo.append("mouth") }
+                    faceInfo["detectedFeatures"] = landmarksInfo
+                }
+
+                facesData.append(faceInfo)
+            }
+            analysis["faces"] = [
+                "count": faceObservations.count,
+                "details": facesData
+            ]
         }

-        // 3. Subjects - Humans (full body)
+        // 3. Human Detection (full body)
         if let humanObservations = humanRequest.results, !humanObservations.isEmpty {
-            let humanCount = humanObservations.count
-            let humanDesc = humanCount == 1 ? "1 person" : "\(humanCount) people"
-
-            // Only add if we didn't already mention faces, or if there are more humans than faces
-            if let faceCount = faceRequest.results?.count, humanCount > faceCount {
-                descriptions.append("Additional subjects: \(humanDesc) visible")
-            } else if faceRequest.results?.isEmpty ?? true {
-                descriptions.append("Subjects: \(humanDesc) detected")
+            let humanData = humanObservations.map { observation -> [String: Any] in
+                let bounds = observation.boundingBox
+                return [
+                    "confidence": Int(observation.confidence * 100),
+                    "size": bounds.width * bounds.height > 0.2 ? "prominent" : "background"
+                ]
             }
+            analysis["humans"] = [
+                "count": humanObservations.count,
+                "details": humanData
+            ]
         }

         // 4. Animals
         if let animalObservations = animalRequest.results, !animalObservations.isEmpty {
             let animals = animalObservations
                 .filter { $0.confidence > 0.5 }
-                .compactMap { observation -> String? in
+                .compactMap { observation -> [String: Any]? in
                     guard let label = observation.labels.first else { return nil }
-                    return "\(label.identifier) (\(Int(label.confidence * 100))%)"
+                    return [
+                        "type": label.identifier,
+                        "confidence": Int(label.confidence * 100)
+                    ]
                 }
             if !animals.isEmpty {
-                descriptions.append("Animals: \(animals.joined(separator: ", "))")
+                analysis["animals"] = animals
             }
         }

         // 5. Saliency (regions of interest)
         if let saliencyObservations = saliencyRequest.results as? [VNSaliencyImageObservation],
            let observation = saliencyObservations.first,
            let salientObjects = observation.salientObjects, !salientObjects.isEmpty {
-            descriptions.append("Key regions: \(salientObjects.count) area\(salientObjects.count == 1 ? "" : "s") of interest")
+            let regions = salientObjects.map { object -> [String: Any] in
+                let bounds = object.boundingBox
+                var position = ""
+                if bounds.origin.x < 0.33 {
+                    position = "left"
+                } else if bounds.origin.x > 0.66 {
+                    position = "right"
+                } else {
+                    position = "center"
+                }
+                return [
+                    "position": position,
+                    "confidence": Int(object.confidence * 100)
+                ]
+            }
+            analysis["regionsOfInterest"] = [
+                "count": salientObjects.count,
+                "regions": regions
+            ]
         }

-        // 6. Horizon detection (indicates landscape/orientation)
+        // 6. Horizon detection
         if let horizonObservations = horizonRequest.results, let horizon = horizonObservations.first {
             let angle = horizon.angle * 180 / .pi
-            if abs(angle) > 5 { // Only mention if horizon is noticeably tilted
-                descriptions.append("Composition: horizon at \(Int(angle))° angle")
+            if abs(angle) > 5 {
+                analysis["horizon"] = [
+                    "angle": Int(angle),
+                    "tilt": angle > 0 ? "clockwise" : "counterclockwise"
+                ]
             }
         }

         // 7. Text content
         if let textObservations = textRequest.results, !textObservations.isEmpty {
-            let text = textObservations
-                .compactMap { $0.topCandidates(1).first?.string }
-                .joined(separator: " ")
-            if !text.isEmpty {
-                let truncatedText = String(text.prefix(100))
-                descriptions.append("Text: \"\(truncatedText)\(text.count > 100 ? "..." : "")\"")
+            let textLines = textObservations.compactMap { observation -> [String: Any]? in
+                guard let text = observation.topCandidates(1).first?.string else { return nil }
+                return [
+                    "text": text,
+                    "confidence": Int(observation.confidence * 100)
+                ]
+            }
+            if !textLines.isEmpty {
+                let fullText = textLines.compactMap { $0["text"] as? String }.joined(separator: " ")
+                analysis["text"] = [
+                    "fullText": String(fullText.prefix(500)),
+                    "lineCount": textLines.count,
+                    "lines": textLines.prefix(10)
+                ]
             }
         }

         // 8. Barcodes/QR codes
         if let barcodeObservations = barcodeRequest.results, !barcodeObservations.isEmpty {
-            let barcodeTypes = barcodeObservations.compactMap { $0.symbology.rawValue }
-            if !barcodeTypes.isEmpty {
-                descriptions.append("Codes: \(barcodeTypes.joined(separator: ", "))")
+            let barcodes = barcodeObservations.compactMap { barcode -> [String: Any]? in
+                var barcodeInfo: [String: Any] = [
+                    "type": barcode.symbology.rawValue
+                ]
+                if let payload = barcode.payloadStringValue {
+                    barcodeInfo["payload"] = payload
+                }
+                return barcodeInfo
             }
+            analysis["barcodes"] = barcodes
         }

-        let description = descriptions.isEmpty
-            ? "Image analyzed"
-            : descriptions.joined(separator: "; ")
+        // 9. Document detection
+        if let documentObservations = documentRequest.results, !documentObservations.isEmpty {
+            analysis["containsDocument"] = true
+            analysis["documentCount"] = documentObservations.count
+        }
+
+        // Convert to JSON string
+        let jsonData = try JSONSerialization.data(withJSONObject: analysis, options: [.prettyPrinted, .sortedKeys])
+        guard let jsonString = String(data: jsonData, encoding: .utf8) else {
+            throw NSError(domain: "IntelligenceService", code: -1, userInfo: [
+                NSLocalizedDescriptionKey: "Failed to convert analysis to JSON"
+            ])
+        }

         WPLogInfo("IntelligenceService.analyzeImage executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")

-        return description
+        return jsonString
     }
 }
```
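For reference, a minimal sketch of how a caller might consume the new JSON output. The key names mirror the diff above, but `printImageAnalysis` itself is a hypothetical helper, not part of this commit:

```swift
import UIKit
import WordPressIntelligence

// Hypothetical call site (not in this commit): runs the new analysis and
// decodes the returned JSON string. The "orientation" and
// "sceneClassification" keys match those built up in analyzeImage above.
@available(iOS 26, *)
func printImageAnalysis(for image: UIImage) async {
    guard let cgImage = image.cgImage else { return }
    do {
        let json = try await IntelligenceService.analyzeImage(cgImage)
        if let data = json.data(using: .utf8),
           let analysis = try JSONSerialization.jsonObject(with: data) as? [String: Any] {
            print("Orientation:", analysis["orientation"] ?? "unknown")
            print("Scenes:", analysis["sceneClassification"] ?? "none")
        }
    } catch {
        print("Image analysis failed:", error)
    }
}
```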

Modules/Sources/WordPressIntelligence/UseCases/ImageAltTextGenerator.swift

Lines changed: 34 additions & 8 deletions
```diff
@@ -17,7 +17,7 @@ import WordPressShared
 public struct ImageAltTextGenerator {
     public var options: GenerationOptions

-    public init(options: GenerationOptions = GenerationOptions(temperature: 0.7)) {
+    public init(options: GenerationOptions = GenerationOptions(temperature: 0.3)) {
         self.options = options
     }

@@ -125,10 +125,12 @@ public struct ImageAltTextGenerator {
     public static var instructions: String {
         """
         You are helping a WordPress user generate alt text for an image.
-        Alt text should be concise, descriptive, and accessible for screen readers.
+        Alt text should be descriptive and accessible for screen readers.

         **Parameters**
-        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
+        - IMAGE_ANALYSIS: Structured JSON with comprehensive visual analysis (MOST IMPORTANT)
+          The JSON includes: sceneClassification, faces (with position, size, features), humans, animals,
+          text content, orientation, regions of interest, barcodes, and document detection
         - FILENAME: the image filename
         - FILE_TYPE: the file type/extension
         - DIMENSIONS: the image dimensions
@@ -137,12 +139,36 @@ public struct ImageAltTextGenerator {
         - DESCRIPTION: the image description (if available)

         **Requirements**
-        - Generate concise alt text (1-2 sentences, max 125 characters)
-        - Prioritize IMAGE_ANALYSIS when describing what's in the image
-        - Focus on what the image depicts, not decorative elements
+        - For simple images: 1-2 sentences describing the main subject and action
+        - For complex images (charts, infographics, screenshots): 2-3 sentences explaining key information
+        - Parse the JSON IMAGE_ANALYSIS to understand:
+          * Scene/subject: Use sceneClassification labels with highest confidence
+          * People: Check faces/humans data for count, position (left/center/right), and shot type (closeup/medium/distant)
+          * Spatial layout: Use position and orientation data to describe composition
+          * Text: If text is prominent, include key text content verbatim
+          * Documents/Screenshots: Mention if containsDocument is true
+        - Prioritize information based on:
+          1. Primary subject (faces, humans, animals, main scene)
+          2. Actions or relationships between subjects
+          3. Setting/context from scene classification
+          4. Important text content (if present)
+        - Use specific, concrete descriptions based on the data
         - Use simple, clear language
-        - Do not include phrases like "image of" or "picture of"
-        - Only output the alt text, nothing else
+        - Do not include "image of", "picture of", or "photo of"
+        - Do not describe decorative or insignificant details
+        - For portraits: Include shot type (closeup/medium) and position if relevant
+        - For screenshots: Mention it's a screenshot and describe the key visible element
+        - For images with text: Include the most important text content
+
+        **Examples**
+        Good: "Person smiling in closeup portrait with outdoor background"
+        Good: "Three people standing left to right in conference room"
+        Good: "Screenshot of WordPress editor with Publish button highlighted"
+        Good: "Bar chart showing 45% increase in website traffic during Q3"
+        Bad: "A person" (too vague, missing details from analysis)
+        Bad: "Image of a chart" (avoid "image of", describe what the chart shows)
+
+        Only output the alt text, nothing else.
         """
     }
```
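Lowering the default temperature from 0.7 to 0.3 trades variety for consistency, which suits factual alt text. As a sketch of how the pieces might fit together — this hunk only shows the initializer and the prompt, so the `generateAltText(imageAnalysis:)` call below is an assumed shape, not the actual API:

```swift
import CoreGraphics
import WordPressIntelligence

// Sketch only: the diff shows the initializer (now defaulting to
// temperature 0.3) and the prompt instructions, but not the generation
// entry point, so generateAltText(imageAnalysis:) here is hypothetical.
@available(iOS 26, *)
func makeAltText(for cgImage: CGImage) async throws -> String {
    let analysisJSON = try await IntelligenceService.analyzeImage(cgImage)
    let generator = ImageAltTextGenerator() // uses the 0.3 default
    return try await generator.generateAltText(imageAnalysis: analysisJSON)
}
```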
Sources/Miniature/ContentView.swift

Lines changed: 13 additions & 1 deletion
```diff
@@ -7,7 +7,19 @@ import JetpackStats

 struct ContentView: View {
     var body: some View {
-        Text("Hello, world!")
+        List {
+            Section("Intelligence") {
+                if #available(iOS 26, *) {
+                    NavigationLink("Image Alt Generator") {
+                        ImageAltGeneratorTestView()
+                    }
+                } else {
+                    Text("Image Alt Generator (iOS 26+ required)")
+                        .foregroundStyle(.secondary)
+                }
+            }
+        }
+        .navigationTitle("Miniature")
     }
 }
```
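One hosting detail: `NavigationLink` and `.navigationTitle` only take effect inside a navigation container. This commit does not show where ContentView is mounted, so the `NavigationStack` wrapper below is an assumed setup:

```swift
import SwiftUI

// Assumed hosting for the Miniature test app (not shown in this commit):
// wrapping ContentView in a NavigationStack makes the NavigationLink push
// and the "Miniature" title render.
@main
struct MiniatureApp: App {
    var body: some Scene {
        WindowGroup {
            NavigationStack {
                ContentView()
            }
        }
    }
}
```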