# PowerShell script: 155 lines, 4.6 KiB (file-viewer metadata captured with the source).
<#
.SYNOPSIS
Script parameters and global error policy.

.PARAMETER InputPath
Path handed to the source discovery step; defaults to the current directory.

.PARAMETER OutputDir
Directory that receives the script's output files.
#>
param(
    [Parameter(Mandatory = $false)]
    [string]$InputPath = ".",

    [Parameter(Mandatory = $false)]
    [string]$OutputDir = "workspace/artifacts/wireframe-gen/extracted"
)

# Turn non-terminating cmdlet errors into terminating ones for the whole script.
$ErrorActionPreference = "Stop"
|
|
|
|
# Builds a lowercase, dash-separated identifier from a file name.
# The extension is dropped, each run of characters outside a-z / 0-9 collapses
# into a single "-", and leading/trailing dashes are removed. Returns "source"
# when nothing usable remains.
function New-Slug {
    param([string]$Value)

    $stem = [System.IO.Path]::GetFileNameWithoutExtension($Value)
    $candidate = ($stem.ToLowerInvariant() -replace "[^a-z0-9]+", "-").Trim("-")

    if (-not [string]::IsNullOrWhiteSpace($candidate)) {
        return $candidate
    }

    return "source"
}
|
|
|
|
# Reads an entire text file as a single UTF-8 string.
# Parameters:
#   Path - literal path of the file (wildcard characters are not expanded).
# Returns:
#   The file content; an empty string for a zero-length file.
function Get-TextFileContent {
    param([string]$Path)

    $content = Get-Content -LiteralPath $Path -Raw -Encoding UTF8

    # Get-Content -Raw emits nothing for an empty file, which would surface as
    # $null at the call site; normalize so callers always receive a [string].
    if ($null -eq $content) {
        return ""
    }

    return $content
}
|
|
|
|
# Extracts the visible text of a .docx file without requiring Word.
# Reads word/document.xml plus every word/header*.xml and word/footer*.xml part
# from the package, collects the content of each <w:t> element, and emits a
# newline for every paragraph end (</w:p>) and line break (<w:br/>).
# Parameters:
#   Path - literal path of the .docx file.
# Returns:
#   The extracted text with carriage returns removed, runs of three or more
#   newlines collapsed to two, and surrounding whitespace trimmed.
function Get-DocxText {
    param([string]$Path)

    Add-Type -AssemblyName System.IO.Compression.FileSystem

    # ProviderPath is the plain file-system path; Path may carry a provider
    # prefix that .NET APIs do not understand.
    $archivePath = (Resolve-Path -LiteralPath $Path).ProviderPath

    # Alternative 1: a real <w:t> element. The name must be followed by
    # whitespace or '>' so that <w:tbl>, <w:tr>, <w:tc> and <w:tab/> are not
    # mistaken for text runs, and the lookbehind rejects self-closing <w:t/>.
    # Alternative 2: paragraph ends and explicit line breaks, which become
    # newlines in the output.
    $tokenPattern = '<w:t(?:\s[^>]*)?(?<!/)>(?<text>.*?)</w:t>|(?<break></w:p>|<w:br(?:\s[^>]*)?/>)'
    $regexOptions = [System.Text.RegularExpressions.RegexOptions]::Singleline

    $zip = [System.IO.Compression.ZipFile]::OpenRead($archivePath)
    try {
        $parts = $zip.Entries | Where-Object {
            $_.FullName -eq "word/document.xml" -or
            $_.FullName -like "word/header*.xml" -or
            $_.FullName -like "word/footer*.xml"
        }

        $builder = New-Object System.Text.StringBuilder
        foreach ($part in $parts) {
            $reader = New-Object System.IO.StreamReader($part.Open())
            try {
                $xml = $reader.ReadToEnd()
            }
            finally {
                $reader.Dispose()
            }

            # A dedicated variable is used for the results; $matches is an
            # automatic variable and must not be overwritten.
            $tokens = [regex]::Matches($xml, $tokenPattern, $regexOptions)
            foreach ($token in $tokens) {
                if ($token.Groups["break"].Success) {
                    [void]$builder.Append("`n")
                }
                else {
                    # Decodes the XML entities (&amp;, &lt;, ...) in the run text.
                    [void]$builder.Append([System.Net.WebUtility]::HtmlDecode($token.Groups["text"].Value))
                }
            }

            # Keep the parts (body, headers, footers) separated from each other.
            [void]$builder.Append("`n")
        }

        return ($builder.ToString() -replace "`r", "" -replace "`n{3,}", "`n`n").Trim()
    }
    finally {
        $zip.Dispose()
    }
}
|
|
|
|
# Extracts text from a PDF by running the external pdftotext tool.
# Parameters:
#   Path - path of the PDF file, passed to pdftotext as-is.
# Returns:
#   A hashtable with the keys Text, Note and Status. When pdftotext is not on
#   PATH, Text is empty and Status is "needs_external_extractor"; otherwise
#   Status is "ok" and Text holds the tool's output (possibly empty).
# Throws:
#   When pdftotext exits with a non-zero code.
function Get-PdfText {
    param([string]$Path)

    $tool = Get-Command pdftotext -ErrorAction SilentlyContinue
    if ($null -eq $tool) {
        return @{
            Text = ""
            Note = "PDF extraction requires pdftotext in PATH. No OCR fallback is used in v1."
            Status = "needs_external_extractor"
        }
    }

    $tempFile = Join-Path ([System.IO.Path]::GetTempPath()) ("wireframe-pdf-" + [guid]::NewGuid().ToString() + ".txt")
    $text = $null
    try {
        # The output encoding is pinned because the file is read back as UTF-8.
        & $tool.Source "-layout" "-enc" "UTF-8" $Path $tempFile | Out-Null

        # A failed conversion must not be reported as a successful extraction.
        if ($LASTEXITCODE -ne 0) {
            throw "pdftotext exited with code $LASTEXITCODE for '$Path'."
        }

        $text = Get-Content -LiteralPath $tempFile -Raw -Encoding UTF8
    }
    finally {
        # Remove the temp file on every path, including failures.
        if (Test-Path -LiteralPath $tempFile) {
            Remove-Item -LiteralPath $tempFile -Force
        }
    }

    # Get-Content -Raw emits nothing for an empty file; keep Text a string.
    if ($null -eq $text) {
        $text = ""
    }

    return @{
        Text = $text
        Note = "Extracted with pdftotext."
        Status = "ok"
    }
}
|
|
|
|
# Lists the input files that the script knows how to extract.
# Parameters:
#   Path - literal path of a single file or of a directory.
# Returns:
#   For a directory: every file below it (recursively) whose extension is
#   .md, .markdown, .txt, .pdf or .docx (compared in lowercase), sorted by
#   full path. For a single supported file: that file.
# Throws:
#   "Unsupported input file type: <ext>" for a single file of any other type.
function Get-SourceFiles {
    param([string]$Path)

    $supported = ".md", ".markdown", ".txt", ".pdf", ".docx"
    $target = Get-Item -LiteralPath (Resolve-Path -LiteralPath $Path)

    # Single-file input: validate the extension and return early.
    if (-not $target.PSIsContainer) {
        if ($supported -notcontains $target.Extension.ToLowerInvariant()) {
            throw "Unsupported input file type: $($target.Extension)"
        }
        return @($target)
    }

    # Directory input: walk the tree and keep only supported files.
    return Get-ChildItem -LiteralPath $target.FullName -Recurse -File |
        Where-Object { $supported -contains $_.Extension.ToLowerInvariant() } |
        Sort-Object FullName
}
|
|
|
|
# ---------------------------------------------------------------------------
# Main flow: extract every supported source under $InputPath into $OutputDir
# as SRC-NNN-<slug>.txt and write source_inventory.json describing the result.
# ---------------------------------------------------------------------------
New-Item -ItemType Directory -Force -Path $OutputDir | Out-Null

$files = @(Get-SourceFiles -Path $InputPath)

# When a directory is scanned, skip anything that lives inside the output
# directory. With the default parameters the output directory is below the
# input directory, so a second run would otherwise treat the previously
# extracted .txt files as new sources.
if (Test-Path -LiteralPath $InputPath -PathType Container) {
    $separator = [System.IO.Path]::DirectorySeparatorChar
    $outputPrefix = (Resolve-Path -LiteralPath $OutputDir).ProviderPath.TrimEnd('\', '/') + $separator

    # Paths are compared case-insensitively only where '\' is the separator.
    $comparison = [System.StringComparison]::Ordinal
    if ($separator -eq '\') {
        $comparison = [System.StringComparison]::OrdinalIgnoreCase
    }

    $files = @($files | Where-Object { -not $_.FullName.StartsWith($outputPrefix, $comparison) })
}

$sources = New-Object System.Collections.Generic.List[object]
$index = 1

foreach ($file in $files) {
    $sourceId = "SRC-{0:D3}" -f $index
    $extension = $file.Extension.ToLowerInvariant()
    $status = "ok"
    $notes = ""
    $text = ""

    # A failure on one file is recorded in the inventory instead of aborting
    # the whole run.
    try {
        switch ($extension) {
            ".md" { $text = Get-TextFileContent -Path $file.FullName }
            ".markdown" { $text = Get-TextFileContent -Path $file.FullName }
            ".txt" { $text = Get-TextFileContent -Path $file.FullName }
            ".docx" { $text = Get-DocxText -Path $file.FullName }
            ".pdf" {
                $pdf = Get-PdfText -Path $file.FullName
                $text = $pdf.Text
                $notes = $pdf.Note
                $status = $pdf.Status
            }
        }
    }
    catch {
        $status = "error"
        $notes = $_.Exception.Message
    }

    # Empty inputs can come back as $null; keep $text a string so that
    # character_count below is always a real length.
    if ($null -eq $text) {
        $text = ""
    }

    $slug = New-Slug -Value $file.Name
    $outName = "$sourceId-$slug.txt"
    $outPath = Join-Path $OutputDir $outName
    Set-Content -LiteralPath $outPath -Value $text -Encoding UTF8

    $sources.Add([ordered]@{
        source_id = $sourceId
        path = $file.FullName
        type = $extension.TrimStart(".")
        status = $status
        extracted_text_path = $outPath
        character_count = $text.Length
        notes = $notes
    })
    $index += 1
}

$inventory = [ordered]@{
    generated_at = (Get-Date).ToUniversalTime().ToString("o")
    input_path = (Resolve-Path -LiteralPath $InputPath).Path
    output_dir = (Resolve-Path -LiteralPath $OutputDir).Path
    sources = $sources
}

$inventoryPath = Join-Path $OutputDir "source_inventory.json"
$inventory | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $inventoryPath -Encoding UTF8

Write-Output "Extracted $($sources.Count) source(s) to $OutputDir"
Write-Output $inventoryPath
|