Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit b2237d1

Browse files
feat: add extractTextItems for structured text extraction with positional data
1 parent 4573b27 commit b2237d1
Copy full SHA for b2237d1

3 files changed

+90 -2 · Lines changed: 90 additions & 2 deletions

File tree

Expand file tree / Collapse file tree
Open diff view settings
Filter options
Expand file tree / Collapse file tree
Open diff view settings
Collapse file

‎src/index.ts‎

Copy file name to clipboardExpand all lines: src/index.ts
+7 -1 · Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { extractImages as _extractImages, renderPageAsImage as _renderPageAsImage } from './image'
22
import { extractLinks as _extractLinks } from './links'
33
import { getMeta as _getMeta } from './meta'
4-
import { extractText as _extractText } from './text'
4+
import { extractText as _extractText, extractTextItems as _extractTextItems } from './text'
55
import { resolvePDFJSImport } from './utils'
66

77
export { configureUnPDF, definePDFJSModule } from './config'
88
export { createIsomorphicCanvasFactory } from './image'
9+
export type { StructuredTextItem } from './text'
910

1011
export {
1112
getDocumentProxy,
@@ -23,6 +24,11 @@ export const extractText: typeof _extractText = async (...args) => {
2324
return await (_extractText as any)(...args)
2425
}
2526

27+
/**
 * Public wrapper around the `extractTextItems` implementation imported from `./text`.
 *
 * Awaits `resolvePDFJSImport()` first, then forwards every argument unchanged and
 * returns the implementation's result. The signature is pinned to
 * `typeof _extractTextItems`, so parameters and return type follow `./text`.
 *
 * NOTE(review): `resolvePDFJSImport()` presumably makes sure the PDF.js module is
 * available before it is used — confirm in `./utils`.
 */
export const extractTextItems: typeof _extractTextItems = async (...args) => {
28+
await resolvePDFJSImport()
29+
return await _extractTextItems(...args)
30+
}
31+
2632
export const extractImages: typeof _extractImages = async (...args) => {
2733
await resolvePDFJSImport()
2834
return await _extractImages(...args)
Collapse file

‎src/text.ts‎

Copy file name to clipboardExpand all lines: src/text.ts
+59 -1 · Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,64 @@
1-
import type { DocumentInitParameters, PDFDocumentProxy, TextItem } from 'pdfjs-dist/types/src/display/api'
1+
import type { DocumentInitParameters, PDFDocumentProxy, TextItem, TextStyle } from 'pdfjs-dist/types/src/display/api'
22
import { getDocumentProxy, isPDFDocumentProxy } from './utils'
33

4+
/**
 * One piece of text from a PDF page together with its position, size, font and
 * direction. This is the element type of the per-page arrays returned by
 * `extractTextItems`; each field is copied or derived from a PDF.js `TextItem`.
 */
export interface StructuredTextItem {
5+
/** Text content. */
6+
str: string
7+
/** X position in PDF coordinate space (origin: bottom-left). */
8+
x: number
9+
/** Y position in PDF coordinate space (origin: bottom-left). */
10+
y: number
11+
/** Width in device space. */
12+
width: number
13+
/** Height in device space. */
14+
height: number
15+
/** Font size derived from the transformation matrix. */
16+
fontSize: number
17+
/** Font family name. */
18+
fontFamily: string
19+
/** Text direction: `"ltr"`, `"rtl"`, or `"ttb"`. */
20+
dir: string
21+
/** Whether the text item is followed by a line break. */
22+
hasEOL: boolean
23+
}
24+
25+
/**
 * Extracts the text items of every page of a PDF, keeping positional and font data.
 *
 * @param data Either raw document data accepted by `getDocumentProxy`, or an
 * already opened `PDFDocumentProxy` (which is used as-is).
 * @returns `totalPages` (the document's `numPages`) and `items`, an array with one
 * entry per page in page order (index 0 holds page 1), each entry being that
 * page's list of `StructuredTextItem`s.
 */
export async function extractTextItems(
26+
data: DocumentInitParameters['data'] | PDFDocumentProxy,
27+
): Promise<{ totalPages: number, items: StructuredTextItem[][] }> {
28+
const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data)
29+
// All pages are requested at once; `Promise.all` keeps results in page order.
// PDF.js page numbers are 1-based, hence `i + 1`.
const items = await Promise.all(
30+
Array.from({ length: pdf.numPages }, (_, i) => getPageTextItems(pdf, i + 1)),
31+
)
32+
33+
return { totalPages: pdf.numPages, items }
34+
}
35+
36+
/**
 * Loads one page of the document and converts its text content into
 * `StructuredTextItem`s.
 *
 * @param document Opened PDF document to read from.
 * @param pageNumber Page number passed straight to `document.getPage` (callers in
 * this file pass 1-based numbers).
 * @returns The page's text items, in the order `getTextContent()` reports them,
 * minus any entry whose `str` is `null` or `undefined`.
 */
async function getPageTextItems(
37+
document: PDFDocumentProxy,
38+
pageNumber: number,
39+
): Promise<StructuredTextItem[]> {
40+
const page = await document.getPage(pageNumber)
41+
const content = await page.getTextContent()
42+
// Lookup table from an item's `fontName` to its style (used for `fontFamily` below).
const styles = content.styles as Record<string, TextStyle>
43+
44+
// The `as TextItem[]` cast is unchecked; the filter that follows drops entries with
// a nullish `str`. NOTE(review): presumably these are PDF.js marked-content entries,
// which carry no `str` — confirm against the PDF.js `getTextContent` typings.
return (content.items as TextItem[])
45+
.filter(item => item.str != null)
46+
.map((item) => {
47+
// Only the last four of the six transform entries are used:
// `c`, `d` feed the font size, `e`, `f` become the x/y position.
const [_a, _b, c, d, e, f] = item.transform
48+
return {
49+
str: item.str,
50+
x: e,
51+
y: f,
52+
width: item.width,
53+
height: item.height,
54+
fontSize: Math.hypot(c, d),
55+
// Falls back to an empty string when the font has no entry in `styles`.
fontFamily: styles[item.fontName]?.fontFamily ?? '',
56+
dir: item.dir,
57+
hasEOL: item.hasEOL,
58+
}
59+
})
60+
}
61+
462
export function extractText(
563
data: DocumentInitParameters['data'] | PDFDocumentProxy,
664
options?: { mergePages?: false },
Collapse file

‎test/index.test.ts‎

Copy file name to clipboardExpand all lines: test/index.test.ts
+24 · Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
extractImages,
99
extractLinks,
1010
extractText,
11+
extractTextItems,
1112
getDocumentProxy,
1213
getMeta,
1314
getResolvedPDFJS,
@@ -61,6 +62,29 @@ describe('unpdf', () => {
6162
expect(totalPages).toMatchInlineSnapshot('1')
6263
})
6364

65+
// Covers `extractTextItems` end to end: `getPDF()` is called without an argument and
// the assertions below expect that fixture to contain exactly one page.
it('extracts structured text items from a PDF', async () => {
66+
const { items, totalPages } = await extractTextItems(await getPDF())
67+
68+
expect(totalPages).toBe(1)
69+
expect(items).toHaveLength(1)
70+
expect(items[0]!.length).toBeGreaterThan(0)
71+
72+
// The snapshot of the first item on the first page pins every `StructuredTextItem` field.
const firstItem = items[0]![0]!
73+
expect(firstItem).toMatchInlineSnapshot(`
74+
{
75+
"dir": "ltr",
76+
"fontFamily": "sans-serif",
77+
"fontSize": 16.1,
78+
"hasEOL": false,
79+
"height": 16.1,
80+
"str": "Dummy PDF file",
81+
"width": 123.41130000000003,
82+
"x": 56.8,
83+
"y": 758.1,
84+
}
85+
`)
86+
})
87+
6488
it('extracts links from a PDF', async () => {
6589
const { links, totalPages } = await extractLinks(await getPDF('links.pdf'))
6690
expect(links.length).toMatchInlineSnapshot('4')

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.