<#
.SYNOPSIS
Extracts plain text from Markdown, text, DOCX and PDF sources and writes a
JSON inventory describing every processed file.

.PARAMETER InputPath
A single supported file, or a directory that is scanned recursively.

.PARAMETER OutputDir
Directory that receives one "SRC-nnn-<slug>.txt" file per source plus
"source_inventory.json". It is created when missing.

.OUTPUTS
Two strings on the output stream: a summary line and the inventory path.
#>
param(
    [Parameter(Mandatory = $false)]
    [string]$InputPath = ".",

    [Parameter(Mandatory = $false)]
    [string]$OutputDir = "workspace/artifacts/wireframe-gen/extracted"
)

# Turn every non-terminating error into a terminating one so the per-file
# try/catch in the main loop can record it.
$ErrorActionPreference = "Stop"

# Builds a lowercase, dash-separated slug from a file name (extension dropped).
# Returns "source" when nothing usable remains after sanitizing.
function New-Slug {
    param([string]$Value)

    $slug = [System.IO.Path]::GetFileNameWithoutExtension($Value).ToLowerInvariant()
    $slug = $slug -replace "[^a-z0-9]+", "-"
    $slug = $slug.Trim("-")
    if ([string]::IsNullOrWhiteSpace($slug)) {
        return "source"
    }
    return $slug
}

# Reads a whole text file as one UTF-8 string.
function Get-TextFileContent {
    param([string]$Path)

    return Get-Content -LiteralPath $Path -Raw -Encoding UTF8
}

# Extracts the visible text of a .docx file (main document, headers, footers).
# Text runs are concatenated in document order; paragraph ends and explicit
# line breaks become newlines, tab characters become tabs. Runs of three or
# more newlines are collapsed to two and the result is trimmed.
function Get-DocxText {
    param([string]$Path)

    Add-Type -AssemblyName System.IO.Compression.FileSystem

    # Hand .NET a plain file-system string rather than a PathInfo object.
    $archivePath = (Resolve-Path -LiteralPath $Path).ProviderPath
    $zip = [System.IO.Compression.ZipFile]::OpenRead($archivePath)
    try {
        $parts = $zip.Entries | Where-Object {
            $_.FullName -eq "word/document.xml" -or
            $_.FullName -like "word/header*.xml" -or
            $_.FullName -like "word/footer*.xml"
        }

        # One pass over the markup, in document order:
        #   <w:t ...>text</w:t>  -> captured text ("text" group)
        #   </w:p>, <w:br .../>  -> newline
        #   <w:tab/>             -> tab character
        # The opening-tag pattern requires ">" or whitespace right after
        # "<w:t" so that <w:tbl>, <w:tc>, <w:tr>, <w:tab> are not treated as
        # text runs. Attribute-bearing <w:tab .../> elements are tab-stop
        # definitions, not characters, so only the bare form is matched.
        $tokenPattern = '<w:t(?:\s[^>/]*)?>(?<text>.*?)</w:t>|</w:p>|<w:br\b[^>]*>|<w:tab\s*/>'
        $regexOptions = [System.Text.RegularExpressions.RegexOptions]::Singleline

        $texts = New-Object System.Collections.Generic.List[string]
        foreach ($part in $parts) {
            $reader = New-Object System.IO.StreamReader($part.Open())
            try {
                $xml = $reader.ReadToEnd()
                # Do not name this variable $matches: that is an automatic variable.
                $tokens = [regex]::Matches($xml, $tokenPattern, $regexOptions)
                foreach ($token in $tokens) {
                    if ($token.Groups["text"].Success) {
                        # Decodes XML entities such as &amp; and &lt;.
                        $texts.Add([System.Net.WebUtility]::HtmlDecode($token.Groups["text"].Value))
                    }
                    elseif ($token.Value.StartsWith("<w:tab")) {
                        $texts.Add("`t")
                    }
                    else {
                        $texts.Add("`n")
                    }
                }
                # Separate one part (document/header/footer) from the next.
                $texts.Add("`n")
            }
            finally {
                $reader.Dispose()
            }
        }

        return (($texts -join "") -replace "`r", "" -replace "`n{3,}", "`n`n").Trim()
    }
    finally {
        $zip.Dispose()
    }
}

# Extracts text from a PDF with the external "pdftotext" tool.
# Returns a hashtable with the keys Text, Note and Status. When pdftotext is
# not on PATH the status is "needs_external_extractor" and Text is empty.
# Throws when pdftotext exits with a non-zero code.
function Get-PdfText {
    param([string]$Path)

    $tool = Get-Command pdftotext -ErrorAction SilentlyContinue
    if ($null -eq $tool) {
        return @{
            Text   = ""
            Note   = "PDF extraction requires pdftotext in PATH. No OCR fallback is used in v1."
            Status = "needs_external_extractor"
        }
    }

    $tempFile = Join-Path ([System.IO.Path]::GetTempPath()) ("wireframe-pdf-" + [guid]::NewGuid().ToString() + ".txt")
    try {
        & $tool.Source "-layout" $Path $tempFile | Out-Null
        if ($LASTEXITCODE -ne 0) {
            throw "pdftotext exited with code $LASTEXITCODE for '$Path'."
        }

        $text = Get-Content -LiteralPath $tempFile -Raw -Encoding UTF8
        if ($null -eq $text) {
            # Get-Content -Raw yields $null for an empty file.
            $text = ""
        }
    }
    finally {
        # Remove the temp file on every exit path, including failures.
        if (Test-Path -LiteralPath $tempFile) {
            Remove-Item -LiteralPath $tempFile -Force
        }
    }

    return @{
        Text   = $text
        Note   = "Extracted with pdftotext."
        Status = "ok"
    }
}

# Lists the supported source files for a path.
#   -Path        a supported file, or a directory scanned recursively.
#   -ExcludeDir  optional directory whose contents are skipped during a
#                directory scan (ignored when empty or not an existing
#                directory, and ignored when -Path is a single file).
# Returns FileInfo objects sorted by full name; throws for an unsupported file.
function Get-SourceFiles {
    param(
        [string]$Path,
        [string]$ExcludeDir = ""
    )

    $resolved = Resolve-Path -LiteralPath $Path
    $item = Get-Item -LiteralPath $resolved
    $extensions = @(".md", ".markdown", ".txt", ".pdf", ".docx")

    if ($item.PSIsContainer) {
        $excludeRoot = $null
        if (-not [string]::IsNullOrWhiteSpace($ExcludeDir) -and (Test-Path -LiteralPath $ExcludeDir -PathType Container)) {
            # Normalize to "<dir><separator>" so a sibling such as
            # "extracted-old" is not excluded by a prefix match on "extracted".
            $excludeRoot = (Resolve-Path -LiteralPath $ExcludeDir).ProviderPath.TrimEnd([char[]]"\/") +
                [System.IO.Path]::DirectorySeparatorChar
        }

        return Get-ChildItem -LiteralPath $item.FullName -Recurse -File |
            Where-Object {
                ($extensions -contains $_.Extension.ToLowerInvariant()) -and
                (($null -eq $excludeRoot) -or
                    (-not $_.FullName.StartsWith($excludeRoot, [System.StringComparison]::OrdinalIgnoreCase)))
            } |
            Sort-Object FullName
    }

    if ($extensions -contains $item.Extension.ToLowerInvariant()) {
        return @($item)
    }

    throw "Unsupported input file type: $($item.Extension)"
}

# ---------------------------------------------------------------------------
# Main flow
# ---------------------------------------------------------------------------

New-Item -ItemType Directory -Force -Path $OutputDir | Out-Null

# The default output directory lives under the default input path; skip it so
# text files written by an earlier run are not picked up as new sources.
$files = Get-SourceFiles -Path $InputPath -ExcludeDir $OutputDir
$sources = New-Object System.Collections.Generic.List[object]
$index = 1

foreach ($file in $files) {
    $sourceId = "SRC-{0:D3}" -f $index
    $extension = $file.Extension.ToLowerInvariant()
    $status = "ok"
    $notes = ""
    $text = ""

    try {
        switch ($extension) {
            { $_ -in ".md", ".markdown", ".txt" } {
                $text = Get-TextFileContent -Path $file.FullName
            }
            ".docx" {
                $text = Get-DocxText -Path $file.FullName
            }
            ".pdf" {
                $pdf = Get-PdfText -Path $file.FullName
                $text = $pdf.Text
                $notes = $pdf.Note
                $status = $pdf.Status
            }
        }
    }
    catch {
        # Record the failure and keep going; the source still gets an
        # (empty) output file and an inventory entry.
        $status = "error"
        $notes = $_.Exception.Message
    }

    $slug = New-Slug -Value $file.Name
    $outName = "$sourceId-$slug.txt"
    $outPath = Join-Path $OutputDir $outName
    Set-Content -LiteralPath $outPath -Value $text -Encoding UTF8

    $sources.Add([ordered]@{
        source_id           = $sourceId
        path                = $file.FullName
        type                = $extension.TrimStart(".")
        status              = $status
        extracted_text_path = $outPath
        character_count     = $text.Length
        notes               = $notes
    })
    $index += 1
}

$inventory = [ordered]@{
    generated_at = (Get-Date).ToUniversalTime().ToString("o")
    input_path   = (Resolve-Path -LiteralPath $InputPath).Path
    output_dir   = (Resolve-Path -LiteralPath $OutputDir).Path
    sources      = $sources
}

$inventoryPath = Join-Path $OutputDir "source_inventory.json"
$inventory | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $inventoryPath -Encoding UTF8

Write-Output "Extracted $($sources.Count) source(s) to $OutputDir"
Write-Output $inventoryPath