Added to Git
This commit is contained in:
@@ -0,0 +1,154 @@
|
||||
# Script parameters:
#   InputPath - a single source file, or a directory that is searched recursively
#               for supported files. Defaults to the current directory.
#   OutputDir - directory that receives the extracted text files and the
#               source inventory JSON.
param(
    [Parameter()]
    [string]$InputPath = '.',

    [Parameter()]
    [string]$OutputDir = 'workspace/artifacts/wireframe-gen/extracted'
)

# Turn non-terminating cmdlet errors into terminating ones for the whole script.
$ErrorActionPreference = 'Stop'
|
||||
|
||||
# Builds a slug from the file-name stem of $Value: lowercased, with every run of
# characters outside a-z / 0-9 collapsed to a single hyphen and outer hyphens removed.
# Returns "source" when nothing usable is left.
function New-Slug {
    param([string]$Value)

    $stem = [System.IO.Path]::GetFileNameWithoutExtension($Value)
    $candidate = ($stem.ToLowerInvariant() -replace "[^a-z0-9]+", "-").Trim("-")

    if (-not [string]::IsNullOrWhiteSpace($candidate)) {
        return $candidate
    }

    return "source"
}
|
||||
|
||||
# Reads the whole file at $Path as a single UTF-8 string.
# Always returns a string: Get-Content -Raw produces no output for a zero-length
# file, so that case is mapped to "" instead of leaking $null to the caller.
function Get-TextFileContent {
    param([string]$Path)

    $content = Get-Content -LiteralPath $Path -Raw -Encoding UTF8
    if ($null -eq $content) {
        return ""
    }

    return $content
}
|
||||
|
||||
# Extracts plain text from a .docx file by scanning the main document part and any
# header/footer parts inside the zip archive.
#   Path    - path to the .docx file.
#   Returns - the text of all matching parts; each paragraph end becomes a newline,
#             carriage returns are removed, runs of 3+ newlines are collapsed to two,
#             and the result is trimmed.
function Get-DocxText {
    param([string]$Path)

    Add-Type -AssemblyName System.IO.Compression.FileSystem

    # A single pass picks up either a text run or a paragraph end, so paragraph breaks
    # stay in document order. Requiring whitespace or ">" directly after "w:t" keeps
    # other elements that merely start with "w:t" (w:tbl, w:tr, w:tc, w:tab, ...) from
    # being mistaken for text runs.
    $tokenPattern = '<w:t(?:\s[^>]*)?>(?<text>.*?)</w:t>|(?<para></w:p>)'
    $tokenOptions = [System.Text.RegularExpressions.RegexOptions]::Singleline

    $archivePath = (Resolve-Path -LiteralPath $Path).ProviderPath
    $zip = [System.IO.Compression.ZipFile]::OpenRead($archivePath)
    try {
        $parts = $zip.Entries | Where-Object {
            $_.FullName -eq "word/document.xml" -or
            $_.FullName -like "word/header*.xml" -or
            $_.FullName -like "word/footer*.xml"
        }

        $builder = New-Object System.Text.StringBuilder
        foreach ($part in $parts) {
            $reader = New-Object System.IO.StreamReader($part.Open())
            try {
                $xml = $reader.ReadToEnd()
            }
            finally {
                $reader.Dispose()
            }

            # Not named $matches: that is a PowerShell automatic variable.
            foreach ($token in [regex]::Matches($xml, $tokenPattern, $tokenOptions)) {
                if ($token.Groups["para"].Success) {
                    [void]$builder.Append("`n")
                }
                else {
                    # Text content is XML-escaped (&amp;, &lt;, ...); decode it.
                    [void]$builder.Append([System.Net.WebUtility]::HtmlDecode($token.Groups["text"].Value))
                }
            }

            # Separate one part from the next.
            [void]$builder.Append("`n")
        }

        return ($builder.ToString() -replace "`r", "" -replace "`n{3,}", "`n`n").Trim()
    }
    finally {
        $zip.Dispose()
    }
}
|
||||
|
||||
# Extracts text from a PDF by running the external pdftotext tool with -layout.
#   Path    - path to the PDF file.
#   Returns - a hashtable with Text, Note and Status keys. When pdftotext is not
#             found, Status is "needs_external_extractor" and Text is empty.
#   Throws  - when pdftotext exits with a non-zero code or its output cannot be read.
function Get-PdfText {
    param([string]$Path)

    $tool = Get-Command pdftotext -ErrorAction SilentlyContinue
    if ($null -eq $tool) {
        return @{
            Text = ""
            Note = "PDF extraction requires pdftotext in PATH. No OCR fallback is used in v1."
            Status = "needs_external_extractor"
        }
    }

    $tempFile = Join-Path ([System.IO.Path]::GetTempPath()) ("wireframe-pdf-" + [guid]::NewGuid().ToString() + ".txt")
    try {
        # Reset first so a stale exit code from an earlier native command is never
        # mistaken for this run's result.
        $global:LASTEXITCODE = 0
        & $tool.Source "-layout" $Path $tempFile | Out-Null
        if ($global:LASTEXITCODE -ne 0) {
            throw "pdftotext exited with code $($global:LASTEXITCODE) for '$Path'."
        }

        $text = Get-Content -LiteralPath $tempFile -Raw -Encoding UTF8
        if ($null -eq $text) {
            # Get-Content -Raw yields nothing for an empty file (e.g. a PDF with no text layer).
            $text = ""
        }
    }
    finally {
        # Clean up the temp file on every path, including failures.
        if (Test-Path -LiteralPath $tempFile) {
            Remove-Item -LiteralPath $tempFile -Force
        }
    }

    return @{
        Text = $text
        Note = "Extracted with pdftotext."
        Status = "ok"
    }
}
|
||||
|
||||
# Resolves $Path to the list of source files to process.
#   - Directory: every file below it (recursively) whose extension is supported,
#     sorted by full path.
#   - Single file: that file, provided its extension is supported.
# Throws "Unsupported input file type: <ext>" for a single file with any other extension.
function Get-SourceFiles {
    param([string]$Path)

    $supported = @(".md", ".markdown", ".txt", ".pdf", ".docx")
    $target = Get-Item -LiteralPath (Resolve-Path -LiteralPath $Path)

    if (-not $target.PSIsContainer) {
        if ($supported -notcontains $target.Extension.ToLowerInvariant()) {
            throw "Unsupported input file type: $($target.Extension)"
        }

        return @($target)
    }

    return Get-ChildItem -LiteralPath $target.FullName -Recurse -File |
        Where-Object { $_.Extension.ToLowerInvariant() -in $supported } |
        Sort-Object -Property FullName
}
|
||||
|
||||
# ---------------------------------------------------------------------------
# Main: extract text from every source file, write one .txt per source into
# $OutputDir, then write source_inventory.json describing all sources.
# ---------------------------------------------------------------------------
New-Item -ItemType Directory -Force -Path $OutputDir | Out-Null

# @() keeps $files an array whether zero, one or many files come back.
$files = @(Get-SourceFiles -Path $InputPath)

# When scanning a directory, skip files that already live inside the output
# directory. With the default parameters the output directory sits below the
# input directory, so a second run would otherwise re-ingest the .txt files
# written by an earlier run. An explicitly named single input file is never filtered.
if (Test-Path -LiteralPath $InputPath -PathType Container) {
    $separators = [char[]]@([System.IO.Path]::DirectorySeparatorChar, [System.IO.Path]::AltDirectorySeparatorChar)
    $outputPrefix = (Resolve-Path -LiteralPath $OutputDir).ProviderPath.TrimEnd($separators) + [System.IO.Path]::DirectorySeparatorChar
    # Case-insensitive comparison: resolved paths may keep the casing the user typed.
    $files = @($files | Where-Object {
        -not $_.FullName.StartsWith($outputPrefix, [System.StringComparison]::OrdinalIgnoreCase)
    })
}

$sources = New-Object System.Collections.Generic.List[object]
$index = 1

foreach ($file in $files) {
    $sourceId = "SRC-{0:D3}" -f $index
    $extension = $file.Extension.ToLowerInvariant()
    $status = "ok"
    $notes = ""
    $text = ""

    # A failure on one file is recorded in the inventory instead of aborting the run.
    try {
        switch ($extension) {
            ".md" { $text = Get-TextFileContent -Path $file.FullName }
            ".markdown" { $text = Get-TextFileContent -Path $file.FullName }
            ".txt" { $text = Get-TextFileContent -Path $file.FullName }
            ".docx" { $text = Get-DocxText -Path $file.FullName }
            ".pdf" {
                $pdf = Get-PdfText -Path $file.FullName
                $text = $pdf.Text
                $notes = $pdf.Note
                $status = $pdf.Status
            }
        }
    }
    catch {
        $status = "error"
        $notes = $_.Exception.Message
    }

    # Extractors can yield $null (e.g. Get-Content -Raw on an empty file);
    # normalise so the written file and character_count are well defined.
    if ($null -eq $text) {
        $text = ""
    }

    $slug = New-Slug -Value $file.Name
    $outName = "$sourceId-$slug.txt"
    $outPath = Join-Path $OutputDir $outName
    Set-Content -LiteralPath $outPath -Value $text -Encoding UTF8

    $sources.Add([ordered]@{
        source_id = $sourceId
        path = $file.FullName
        type = $extension.TrimStart(".")
        status = $status
        extracted_text_path = $outPath
        character_count = $text.Length
        notes = $notes
    })
    $index += 1
}

$inventory = [ordered]@{
    generated_at = (Get-Date).ToUniversalTime().ToString("o")
    input_path = (Resolve-Path -LiteralPath $InputPath).Path
    output_dir = (Resolve-Path -LiteralPath $OutputDir).Path
    sources = $sources
}

$inventoryPath = Join-Path $OutputDir "source_inventory.json"
$inventory | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $inventoryPath -Encoding UTF8

Write-Output "Extracted $($sources.Count) source(s) to $OutputDir"
Write-Output $inventoryPath
|
||||
Reference in New Issue
Block a user