
Commit cfbd47d

Add an app for testing alt text
1 parent 8fdf171 commit cfbd47d

6 files changed (+496, -61 lines)

Modules/Sources/WordPressIntelligence/IntelligenceService.swift

Lines changed: 162 additions & 41 deletions
```diff
@@ -71,129 +71,250 @@ public enum IntelligenceService {
     /// Uses multiple Vision APIs to gather detailed information about the image:
     /// - Image classification for scene and object identification
     /// - Text recognition for readable content
-    /// - Face detection for portrait photos
+    /// - Face detection and landmarks for portraits
     /// - Human and animal detection for subjects
     /// - Saliency analysis for key regions of interest
     /// - Horizon detection for landscape orientation
     /// - Barcode detection for QR codes and barcodes
+    /// - Document detection for papers and screenshots
     ///
     /// - Parameter cgImage: The image to analyze
-    /// - Returns: A comprehensive description of what's in the image
+    /// - Returns: A JSON string with structured analysis data
    /// - Throws: If image analysis fails
     @available(iOS 26, *)
     public static func analyzeImage(_ cgImage: CGImage) async throws -> String {
         let startTime = CFAbsoluteTimeGetCurrent()

-        var descriptions: [String] = []
-
         // Create all analysis requests
         let classifyRequest = VNClassifyImageRequest()
         let textRequest = VNRecognizeTextRequest()
         textRequest.recognitionLevel = .accurate
+        textRequest.usesLanguageCorrection = true

         let faceRequest = VNDetectFaceRectanglesRequest()
+        let faceLandmarksRequest = VNDetectFaceLandmarksRequest()
         let humanRequest = VNDetectHumanRectanglesRequest()
         let animalRequest = VNRecognizeAnimalsRequest()
         let saliencyRequest = VNGenerateAttentionBasedSaliencyImageRequest()
         let horizonRequest = VNDetectHorizonRequest()
         let barcodeRequest = VNDetectBarcodesRequest()
+        let documentRequest = VNDetectDocumentSegmentationRequest()

         // Perform all requests
         let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
         try handler.perform([
             classifyRequest,
             textRequest,
             faceRequest,
+            faceLandmarksRequest,
             humanRequest,
             animalRequest,
             saliencyRequest,
             horizonRequest,
-            barcodeRequest
+            barcodeRequest,
+            documentRequest
         ])

+        // Build structured analysis result
+        var analysis: [String: Any] = [:]
+
+        // Image dimensions
+        analysis["imageSize"] = [
+            "width": cgImage.width,
+            "height": cgImage.height
+        ]
+
+        let aspectRatio = Double(cgImage.width) / Double(cgImage.height)
+        if aspectRatio > 1.5 {
+            analysis["orientation"] = "landscape"
+        } else if aspectRatio < 0.7 {
+            analysis["orientation"] = "portrait"
+        } else {
+            analysis["orientation"] = "square"
+        }
+
         // 1. Scene/Object Classification
         if let classifications = classifyRequest.results?.prefix(5) {
             let labels = classifications
                 .filter { $0.confidence > 0.3 }
-                .map { "\($0.identifier.replacingOccurrences(of: "_", with: " ")) (\(Int($0.confidence * 100))%)" }
+                .map { [
+                    "label": $0.identifier.replacingOccurrences(of: "_", with: " "),
+                    "confidence": Int($0.confidence * 100)
+                ] as [String: Any] }
             if !labels.isEmpty {
-                descriptions.append("Scene: \(labels.joined(separator: ", "))")
+                analysis["sceneClassification"] = labels
             }
         }

-        // 2. Subjects - Faces
-        if let faceObservations = faceRequest.results, !faceObservations.isEmpty {
-            let faceCount = faceObservations.count
-            let faceDesc = faceCount == 1 ? "1 face" : "\(faceCount) faces"
-            descriptions.append("Subjects: \(faceDesc) detected")
+        // 2. Face Detection with Landmarks
+        var facesData: [[String: Any]] = []
+        if let faceObservations = faceLandmarksRequest.results, !faceObservations.isEmpty {
+            for face in faceObservations {
+                var faceInfo: [String: Any] = [:]
+
+                // Position
+                let bounds = face.boundingBox
+                if bounds.origin.x < 0.33 {
+                    faceInfo["horizontalPosition"] = "left"
+                } else if bounds.origin.x > 0.66 {
+                    faceInfo["horizontalPosition"] = "right"
+                } else {
+                    faceInfo["horizontalPosition"] = "center"
+                }
+
+                if bounds.origin.y < 0.33 {
+                    faceInfo["verticalPosition"] = "bottom"
+                } else if bounds.origin.y > 0.66 {
+                    faceInfo["verticalPosition"] = "top"
+                } else {
+                    faceInfo["verticalPosition"] = "middle"
+                }
+
+                // Size (relative to image)
+                let faceArea = bounds.width * bounds.height
+                if faceArea > 0.25 {
+                    faceInfo["size"] = "closeup"
+                } else if faceArea > 0.1 {
+                    faceInfo["size"] = "medium"
+                } else {
+                    faceInfo["size"] = "distant"
+                }
+
+                // Landmarks details
+                if let landmarks = face.landmarks {
+                    var landmarksInfo: [String] = []
+                    if landmarks.faceContour != nil { landmarksInfo.append("face contour") }
+                    if landmarks.leftEye != nil { landmarksInfo.append("left eye") }
+                    if landmarks.rightEye != nil { landmarksInfo.append("right eye") }
+                    if landmarks.nose != nil { landmarksInfo.append("nose") }
+                    if landmarks.outerLips != nil { landmarksInfo.append("mouth") }
+                    faceInfo["detectedFeatures"] = landmarksInfo
+                }
+
+                facesData.append(faceInfo)
+            }
+            analysis["faces"] = [
+                "count": faceObservations.count,
+                "details": facesData
+            ]
         }

-        // 3. Subjects - Humans (full body)
+        // 3. Human Detection (full body)
         if let humanObservations = humanRequest.results, !humanObservations.isEmpty {
-            let humanCount = humanObservations.count
-            let humanDesc = humanCount == 1 ? "1 person" : "\(humanCount) people"
-
-            // Only add if we didn't already mention faces, or if there are more humans than faces
-            if let faceCount = faceRequest.results?.count, humanCount > faceCount {
-                descriptions.append("Additional subjects: \(humanDesc) visible")
-            } else if faceRequest.results?.isEmpty ?? true {
-                descriptions.append("Subjects: \(humanDesc) detected")
+            let humanData = humanObservations.map { observation -> [String: Any] in
+                let bounds = observation.boundingBox
+                return [
+                    "confidence": Int(observation.confidence * 100),
+                    "size": bounds.width * bounds.height > 0.2 ? "prominent" : "background"
+                ]
             }
+            analysis["humans"] = [
+                "count": humanObservations.count,
+                "details": humanData
+            ]
         }

         // 4. Animals
         if let animalObservations = animalRequest.results, !animalObservations.isEmpty {
             let animals = animalObservations
                 .filter { $0.confidence > 0.5 }
-                .compactMap { observation -> String? in
+                .compactMap { observation -> [String: Any]? in
                     guard let label = observation.labels.first else { return nil }
-                    return "\(label.identifier) (\(Int(label.confidence * 100))%)"
+                    return [
+                        "type": label.identifier,
+                        "confidence": Int(label.confidence * 100)
+                    ]
                 }
             if !animals.isEmpty {
-                descriptions.append("Animals: \(animals.joined(separator: ", "))")
+                analysis["animals"] = animals
             }
         }

         // 5. Saliency (regions of interest)
         if let saliencyObservations = saliencyRequest.results as? [VNSaliencyImageObservation],
            let observation = saliencyObservations.first,
            let salientObjects = observation.salientObjects, !salientObjects.isEmpty {
-            descriptions.append("Key regions: \(salientObjects.count) area\(salientObjects.count == 1 ? "" : "s") of interest")
+            let regions = salientObjects.map { object -> [String: Any] in
+                let bounds = object.boundingBox
+                var position = ""
+                if bounds.origin.x < 0.33 {
+                    position = "left"
+                } else if bounds.origin.x > 0.66 {
+                    position = "right"
+                } else {
+                    position = "center"
+                }
+                return [
+                    "position": position,
+                    "confidence": Int(object.confidence * 100)
+                ]
+            }
+            analysis["regionsOfInterest"] = [
+                "count": salientObjects.count,
+                "regions": regions
+            ]
         }

-        // 6. Horizon detection (indicates landscape/orientation)
+        // 6. Horizon detection
         if let horizonObservations = horizonRequest.results, let horizon = horizonObservations.first {
             let angle = horizon.angle * 180 / .pi
-            if abs(angle) > 5 { // Only mention if horizon is noticeably tilted
-                descriptions.append("Composition: horizon at \(Int(angle))° angle")
+            if abs(angle) > 5 {
+                analysis["horizon"] = [
+                    "angle": Int(angle),
+                    "tilt": angle > 0 ? "clockwise" : "counterclockwise"
+                ]
             }
         }

         // 7. Text content
         if let textObservations = textRequest.results, !textObservations.isEmpty {
-            let text = textObservations
-                .compactMap { $0.topCandidates(1).first?.string }
-                .joined(separator: " ")
-            if !text.isEmpty {
-                let truncatedText = String(text.prefix(100))
-                descriptions.append("Text: \"\(truncatedText)\(text.count > 100 ? "..." : "")\"")
+            let textLines = textObservations.compactMap { observation -> [String: Any]? in
+                guard let text = observation.topCandidates(1).first?.string else { return nil }
+                return [
+                    "text": text,
+                    "confidence": Int(observation.confidence * 100)
+                ]
+            }
+            if !textLines.isEmpty {
+                let fullText = textLines.compactMap { $0["text"] as? String }.joined(separator: " ")
+                analysis["text"] = [
+                    "fullText": String(fullText.prefix(500)),
+                    "lineCount": textLines.count,
+                    "lines": textLines.prefix(10)
+                ]
             }
         }

         // 8. Barcodes/QR codes
         if let barcodeObservations = barcodeRequest.results, !barcodeObservations.isEmpty {
-            let barcodeTypes = barcodeObservations.compactMap { $0.symbology.rawValue }
-            if !barcodeTypes.isEmpty {
-                descriptions.append("Codes: \(barcodeTypes.joined(separator: ", "))")
+            let barcodes = barcodeObservations.compactMap { barcode -> [String: Any]? in
+                var barcodeInfo: [String: Any] = [
+                    "type": barcode.symbology.rawValue
+                ]
+                if let payload = barcode.payloadStringValue {
+                    barcodeInfo["payload"] = payload
+                }
+                return barcodeInfo
             }
+            analysis["barcodes"] = barcodes
         }

-        let description = descriptions.isEmpty
-            ? "Image analyzed"
-            : descriptions.joined(separator: "; ")
+        // 9. Document detection
+        if let documentObservations = documentRequest.results, !documentObservations.isEmpty {
+            analysis["containsDocument"] = true
+            analysis["documentCount"] = documentObservations.count
+        }
+
+        // Convert to JSON string
+        let jsonData = try JSONSerialization.data(withJSONObject: analysis, options: [.prettyPrinted, .sortedKeys])
+        guard let jsonString = String(data: jsonData, encoding: .utf8) else {
+            throw NSError(domain: "IntelligenceService", code: -1, userInfo: [
+                NSLocalizedDescriptionKey: "Failed to convert analysis to JSON"
+            ])
+        }

         WPLogInfo("IntelligenceService.analyzeImage executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")

-        return description
+        return jsonString
     }
 }
```
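For reference, a minimal sketch of how a caller might consume the new JSON output. The key names mirror the diff above, but `printImageAnalysis` itself is a hypothetical helper, not part of this commit:

```swift
import UIKit
import WordPressIntelligence

// Hypothetical call site (not in this commit): runs the new analysis and
// decodes the returned JSON string. The "orientation" and
// "sceneClassification" keys match those built up in analyzeImage above.
@available(iOS 26, *)
func printImageAnalysis(for image: UIImage) async {
    guard let cgImage = image.cgImage else { return }
    do {
        let json = try await IntelligenceService.analyzeImage(cgImage)
        if let data = json.data(using: .utf8),
           let analysis = try JSONSerialization.jsonObject(with: data) as? [String: Any] {
            print("Orientation:", analysis["orientation"] ?? "unknown")
            print("Scenes:", analysis["sceneClassification"] ?? "none")
        }
    } catch {
        print("Image analysis failed:", error)
    }
}
```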

Modules/Sources/WordPressIntelligence/UseCases/ImageAltTextGenerator.swift

Lines changed: 34 additions & 8 deletions
```diff
@@ -17,7 +17,7 @@ import WordPressShared
 public struct ImageAltTextGenerator {
     public var options: GenerationOptions

-    public init(options: GenerationOptions = GenerationOptions(temperature: 0.7)) {
+    public init(options: GenerationOptions = GenerationOptions(temperature: 0.3)) {
         self.options = options
     }

@@ -125,10 +125,12 @@ public struct ImageAltTextGenerator {
     public static var instructions: String {
         """
         You are helping a WordPress user generate alt text for an image.
-        Alt text should be concise, descriptive, and accessible for screen readers.
+        Alt text should be descriptive and accessible for screen readers.

         **Parameters**
-        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
+        - IMAGE_ANALYSIS: Structured JSON with comprehensive visual analysis (MOST IMPORTANT)
+          The JSON includes: sceneClassification, faces (with position, size, features), humans, animals,
+          text content, orientation, regions of interest, barcodes, and document detection
         - FILENAME: the image filename
         - FILE_TYPE: the file type/extension
         - DIMENSIONS: the image dimensions
@@ -137,12 +139,36 @@ public struct ImageAltTextGenerator {
         - DESCRIPTION: the image description (if available)

         **Requirements**
-        - Generate concise alt text (1-2 sentences, max 125 characters)
-        - Prioritize IMAGE_ANALYSIS when describing what's in the image
-        - Focus on what the image depicts, not decorative elements
+        - For simple images: 1-2 sentences describing the main subject and action
+        - For complex images (charts, infographics, screenshots): 2-3 sentences explaining key information
+        - Parse the JSON IMAGE_ANALYSIS to understand:
+          * Scene/subject: Use sceneClassification labels with highest confidence
+          * People: Check faces/humans data for count, position (left/center/right), and shot type (closeup/medium/distant)
+          * Spatial layout: Use position and orientation data to describe composition
+          * Text: If text is prominent, include key text content verbatim
+          * Documents/Screenshots: Mention if containsDocument is true
+        - Prioritize information based on:
+          1. Primary subject (faces, humans, animals, main scene)
+          2. Actions or relationships between subjects
+          3. Setting/context from scene classification
+          4. Important text content (if present)
+        - Use specific, concrete descriptions based on the data
         - Use simple, clear language
-        - Do not include phrases like "image of" or "picture of"
-        - Only output the alt text, nothing else
+        - Do not include "image of", "picture of", or "photo of"
+        - Do not describe decorative or insignificant details
+        - For portraits: Include shot type (closeup/medium) and position if relevant
+        - For screenshots: Mention it's a screenshot and describe the key visible element
+        - For images with text: Include the most important text content
+
+        **Examples**
+        Good: "Person smiling in closeup portrait with outdoor background"
+        Good: "Three people standing left to right in conference room"
+        Good: "Screenshot of WordPress editor with Publish button highlighted"
+        Good: "Bar chart showing 45% increase in website traffic during Q3"
+        Bad: "A person" (too vague, missing details from analysis)
+        Bad: "Image of a chart" (avoid "image of", describe what the chart shows)
+
+        Only output the alt text, nothing else.
         """
     }
```
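Lowering the default temperature from 0.7 to 0.3 trades variety for consistency, which suits factual alt text. As a sketch of how the pieces might fit together — this hunk only shows the initializer and the prompt, so the `generateAltText(imageAnalysis:)` call below is an assumed shape, not the actual API:

```swift
import CoreGraphics
import WordPressIntelligence

// Sketch only: the diff shows the initializer (now defaulting to
// temperature 0.3) and the prompt instructions, but not the generation
// entry point, so generateAltText(imageAnalysis:) here is hypothetical.
@available(iOS 26, *)
func makeAltText(for cgImage: CGImage) async throws -> String {
    let analysisJSON = try await IntelligenceService.analyzeImage(cgImage)
    let generator = ImageAltTextGenerator() // uses the 0.3 default
    return try await generator.generateAltText(imageAnalysis: analysisJSON)
}
```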
Sources/Miniature/ContentView.swift

Lines changed: 13 additions & 1 deletion
```diff
@@ -7,7 +7,19 @@ import JetpackStats

 struct ContentView: View {
     var body: some View {
-        Text("Hello, world!")
+        List {
+            Section("Intelligence") {
+                if #available(iOS 26, *) {
+                    NavigationLink("Image Alt Generator") {
+                        ImageAltGeneratorTestView()
+                    }
+                } else {
+                    Text("Image Alt Generator (iOS 26+ required)")
+                        .foregroundStyle(.secondary)
+                }
+            }
+        }
+        .navigationTitle("Miniature")
     }
 }
```
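One hosting detail: `NavigationLink` and `.navigationTitle` only take effect inside a navigation container. This commit does not show where ContentView is mounted, so the `NavigationStack` wrapper below is an assumed setup:

```swift
import SwiftUI

// Assumed hosting for the Miniature test app (not shown in this commit):
// wrapping ContentView in a NavigationStack makes the NavigationLink push
// and the "Miniature" title render.
@main
struct MiniatureApp: App {
    var body: some Scene {
        WindowGroup {
            NavigationStack {
                ContentView()
            }
        }
    }
}
```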