Upgrade to Pro — share decks privately, control downloads, hide ads and more …

AIを活用したレシート読み取り機能の開発から得られた実践知 / AI Receipt Scan...

Avatar for rockname rockname
September 19, 2025

AIを活用したレシート読み取り機能の開発から得られた実践知 / AI Receipt Scan Practice

Avatar for rockname

rockname

September 19, 2025
Tweet

More Decks by rockname

Other Decks in Programming

Transcript

  1. 㲔 Smart, Secure, Mobile Super powered Banking and Living. 

    LLMを活用した機能を開発したことがある人🙋
  2. X

  3. let session = AVCaptureSession() func configureSession() { session.beginConfiguration() defer {

    session.commitConfiguration() } setupInput() setupOutput() } func setupInput() { … } func setupOutput() { … }
  4. func configureSession() { … setupInput() setupOutput() } func setupInput() {

    let device = AVCaptureDevice.default( .builtInWideAngleCamera, for: .video, position: .back )! let deviceInput = try! AVCaptureDeviceInput(device: device) if session.canAddInput(deviceInput) { session.addInput(deviceInput) } } func setupOutput() { … }
  5. func configureSession() { … setupInput() setupOutput() } func setupInput() {

    … } func setupOutput() { let videoDataOutput = AVCaptureVideoDataOutput() videoDataOutput.setSampleBufferDelegate( self, queue: sampleBufferQueue ) if session.canAddOutput(videoDataOutput) { session.addOutput(videoDataOutput) } videoDataOutput.connection(with: .video)?.isEnabled = true }
  6. // AVCaptureVideoDataOutputSampleBufferDelegate method func captureOutput( _ output: AVCaptureOutput, didOutput sampleBuffer:

    CMSampleBuffer, from connection: AVCaptureConnection ) { // TODO: ビデオフレームのハンドリング }
  7. func setupInput() { … let minimumSubjectDistance = minimumSubjectDistance( fieldOfView: device.activeFormat.videoFieldOfView,

    minimumReceiptWidth: 60, // 想定される最小のレシート短辺 (mm) previewFillPercentage: 0.8 // プレビューの幅に対する被写体の割合 ) } func minimumSubjectDistance( fieldOfView: Float, minimumReceiptWidth: Float, previewFillPercentage: Float ) -> Float { let radians = degreesToRadians(fieldOfView / 2.0) let filledSize = minimumReceiptWidth / previewFillPercentage return filledSize / tan(radians) }
  8. func setupInput() { … let deviceMinimumFocusDistance = Float(device.minimumFocusDistance) if minimumSubjectDistance

    < deviceMinimumFocusDistance { let zoomFactor = deviceMinimumFocusDistance / minimumSubjectDistance device.videoZoomFactor = CGFloat(zoomFactor) } }
  9. actor ReceiptDetector { func processVideoFrame(sampleBuffer: CMSampleBuffer) async { await startToTrackReceiptRect(sampleBuffer:

    sampleBuffer) } private func startToTrackReceiptRect(sampleBuffer: CMSampleBuffer) async { do { let request = DetectDocumentSegmentationRequest() let detectedDocumentObservation = try await request.perform( on: sampleBuffer, orientation: .right ) } catch { … } } }
  10. actor ReceiptDetector { private var trackRectangleRequest: TrackRectangleRequest? … private func

    startToTrackReceiptRect(sampleBuffer: CMSampleBuffer) async { … guard let detectedDocumentObservation, detectedDocumentObservation.confidence > 0.5 else { return } let trackingRequest = TrackRectangleRequest( detectedRectangle: detectedDocumentObservation, .revision1 ) trackingRequest.trackingLevel = .accurate self.trackRectangleRequest = trackingRequest }
  11. actor ReceiptDetector { private var trackRectangleRequest: TrackRectangleRequest? func processVideoFrame(sampleBuffer: CMSampleBuffer)

    async { guard let trackRectangleRequest else { await startToTrackReceiptRect(sampleBuffer: sampleBuffer) return } await trackReceiptRect( trackRectangleRequest: trackRectangleRequest, sampleBuffer: sampleBuffer ) } private func trackReceiptRect(…) async { … } }
  12. actor ReceiptDetector { … private func trackReceiptRect( trackRectangleRequest: TrackRectangleRequest, sampleBuffer:

    CMSampleBuffer ) async { do { let rectangleObservation = try await trackRectangleRequest.perform( on: sampleBuffer, orientation: .right ) // ✅ 検出位置とサイズが近似し続けているかどうかを判定する } catch { … } } }
  13. actor ReceiptDetector { private var isDetectingReceipt = false … func

    processVideoFrame(sampleBuffer: CMSampleBuffer) async { if isDetectingReceipt { return } isDetectingReceipt = true defer { isDetectingReceipt = false } … } … }
  14. guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return } let

    originalImage = CIImage(cvImageBuffer: pixelBuffer) let originalSize = originalImage.extent.size let originalShortSide = min(originalSize.width, originalSize.height) let targetShortSide: CGFloat = 512 let resizeScale = min(1.0, targetShortSide / originalShortSide) let resizedImage = originalImage.transformed( by: CGAffineTransform(scaleX: resizeScale, y: resizeScale) ) …
  15. final class ReceiptTextRecognizer { func recognizeText(sampleBuffer: CMSampleBuffer) async throws ->

    [String] { var request = RecognizeTextRequest(.revision3) request.recognitionLanguages = [ Locale.Language(identifier: "ja-JP"), Locale.Language(identifier: "en-US") ] request.recognitionLevel = .accurate let result = try await request.perform( on: sampleBuffer, orientation: .right ) } }
  16. func recognizeText( sampleBuffer: CMSampleBuffer, receiptRect: NormalizedRect ) async throws ->

    [String] { var request = RecognizeTextRequest(.revision3) let expandedRect = expandNormalizedRect(receiptRect, percent: 0.05) request.regionOfInterest = expandedRect … } private func expandNormalizedRect( _ rect: NormalizedRect, percent: CGFloat ) -> NormalizedRect { // ✅ 矩形を幅・高さの割合（percent）だけ四方に拡張する }
  17. { "string": "コカ・コーラ ヤカンノムギチャ", "bounding_box": { "x": 0.11, "y": 0.70,

    "width": 0.38, "height": 0.01 } }, { "string": "650ml", "bounding_box": { "x": 0.49, "y": 0.70, "width": 0.14, "height": 0.01 } }, …
  18. { "string": "コカ・コーラ ヤカンノムギチャ", "bounding_box": { "x": 0.11, "y": 0.70,

    "width": 0.38, "height": 0.01 } }, { "string": "650ml", "bounding_box": { "x": 0.49, "y": 0.70, "width": 0.14, "height": 0.01 } }, …
  19. @Generable struct ReceiptInferenceResponse { @Guide(description: "Merchant/store name.") let shopName: String?

    @Guide(description: "Purchase (transaction) date normalized to ISO 8601 `YYYY-MM-DD`.") let date: String? @Guide(description: "Grand total amount paid in JPY (tax- inclusive), as a non-negative integer.") let amount: Int? @Guide(description: "Category for this receipt.") let category: Category? }
  20. final class ReceiptStructureInferrer { private let session: LanguageModelSession init() {

    self.session = LanguageModelSession( instructions: Instructions { """ … """ } ) } }
  21. Instructions { """ You are an information extractor for retail

    receipts. Carefully read through the input text in Japanese, scanned from a receipt. Line breaks correspond to each line of the receipt from top to bottom. Extract receipt information and return it in the provided schema format Each category definition for a receipt is following: - groceries: foodstuff, groceries - diningOut: payments related to restaurants, bars, and other dining expenses - householdSupplies: daily necessities, excluding food items … """ }
  22. final class ReceiptStructureInferrer { … func infer(receiptTextLines: [String]) async throws

    -> ReceiptInferenceResponse { let prompt = Prompt { … } let response = try await session.respond( to: prompt, generating: ReceiptInferenceResponse.self, options: GenerationOptions(temperature: 0.1) ) return response.content } }
  23. Prompt { """ ## Task Extract fields from the following

    OCR lines of a Japanese retail receipt. ## Input \(receiptTextLines.joined(separator: "\n")) ## Requirements - The output format must be valid. - Ensure that the output is complete and not truncated. - If any information is missing or unclear, use "nil" for that field. """ }
  24. まとめ • AIレシート読み取り機能📸 • LLMの“構造化”という強みを活かして安価に高い精度を実現した • Garbage in, garbage out🔥 •

    インプットをいかに綺麗にするかがLLMの推論の精度に直結する • Foundation Models🧠 • Local LLMのポテンシャルを感じさせた