/**
 * Process encoded text from the PDF stream.
 *
 * Decodes the given bytes with the current font, advances the text matrix
 * glyph by glyph, and hands each accumulated run of characters to
 * processTextPosition as a TextPosition. Override this method to perform an
 * action when encoded text is being processed.
 *
 * @param string the encoded text bytes; a character code may occupy one or
 *               two bytes.
 * @throws IOException declared by the signature; presumably raised by the font
 *                     decoding or width lookups (confirm against PDFont).
 */
public void processEncodedText(byte[] string) throws IOException {
/* Note on variable names. There are three different units being used
* in this code. Character sizes are given in glyph units, text locations
* are initially given in text units, and we want to save the data in
* display units. The variable names should end with Text or Disp to
* represent if the values are in text or disp units (no glyph units are saved).
*/
// Snapshot the text-state parameters once; they are not re-read inside the loop.
final float fontSizeText = graphicsState.getTextState().getFontSize();
// Horizontal scaling is stored as a percentage; convert it to a factor.
final float horizontalScalingText = graphicsState.getTextState().getHorizontalScalingPercent()/100f;
//float verticalScalingText = horizontalScaling;//not sure if this is right but what else to do???
final float riseText = graphicsState.getTextState().getRise();
final float wordSpacingText = graphicsState.getTextState().getWordSpacing();
final float characterSpacingText = graphicsState.getTextState().getCharacterSpacing();
final PDFont font = graphicsState.getTextState().getFont();
//This will typically be 1000 but in the case of a type3 font
//this might be a different number
final float glyphSpaceToTextSpaceFactor = 1f/font.getFontMatrix().getValue( 0, 0 );
float spaceWidthText=0;
try{ // to avoid crash as described in PDFBOX-614
// lets see what the space displacement should be
spaceWidthText = (font.getFontWidth( SPACE_BYTES, 0, 1 )/glyphSpaceToTextSpaceFactor);
}catch (Throwable exception)
{
// Best effort: log and leave spaceWidthText at 0 so the fallback below applies.
log.warn( exception, exception);
}
// Fallback when the font reported no (or a zero) width for the space character.
if( spaceWidthText == 0 )
{
spaceWidthText = (font.getAverageFontWidth()/glyphSpaceToTextSpaceFactor);
//The average space width appears to be higher than necessary
//so lets make it a little bit smaller.
spaceWidthText *= .80f;
}
/* Convert textMatrix to display units */
// initialMatrix is the identity except for element (2,1), which carries the text rise.
final Matrix initialMatrix = new Matrix();
initialMatrix.setValue(0,0,1);
initialMatrix.setValue(0,1,0);
initialMatrix.setValue(0,2,0);
initialMatrix.setValue(1,0,0);
initialMatrix.setValue(1,1,1);
initialMatrix.setValue(1,2,0);
initialMatrix.setValue(2,0,0);
initialMatrix.setValue(2,1,riseText);
initialMatrix.setValue(2,2,1);
final Matrix ctm = graphicsState.getCurrentTransformationMatrix();
// dispMatrix (rise combined with the CTM) is fixed for the whole string.
final Matrix dispMatrix = initialMatrix.multiply( ctm );
// Display-space matrix at the start of the current run; reassigned after every flush.
Matrix textMatrixStDisp = textMatrix.multiply( dispMatrix );
Matrix textMatrixEndDisp = null;
// The scales are taken once from the initial start matrix and reused for every run.
final float xScaleDisp = textMatrixStDisp.getXScale();
final float yScaleDisp = textMatrixStDisp.getYScale();
final float spaceWidthDisp = spaceWidthText * xScaleDisp * fontSizeText;
final float wordSpacingDisp = wordSpacingText * xScaleDisp * fontSizeText;
// Running maximum glyph height; it is never reset between flushed runs.
// NOTE(review): confirm that a per-string (rather than per-run) maximum is intended.
float maxVerticalDisplacementText = 0;
//We won't know the actual number of output characters until we process
//the byte data (a code could be two bytes, and there are some cases
//where a single code results in several output characters, e.g. "fi"),
//so the width buffer starts at string.length and is grown on demand below.
float[] individualWidthsBuffer = new float[string.length];
StringBuilder characterBuffer = new StringBuilder(string.length);
int codeLength = 1;
// The loop step is the byte length of the code consumed in the current iteration.
for( int i=0; i< string.length; i+=codeLength )
{
// Decode the value to a Unicode character
codeLength = 1;
String c = font.encode( string, i, codeLength );
if( c == null && i+1< string.length)
{
//maybe a multibyte encoding
// codeLength stays at 2 even if this second attempt also returns null.
codeLength++;
c = font.encode( string, i, codeLength );
}
//todo, handle horizontal displacement
// get the width and height of this character in text units
float characterHorizontalDisplacementText =
(font.getFontWidth( string, i, codeLength )/glyphSpaceToTextSpaceFactor);
maxVerticalDisplacementText =
Math.max(
maxVerticalDisplacementText,
font.getFontHeight( string, i, codeLength)/glyphSpaceToTextSpaceFactor);
// PDF Spec - 5.5.2 Word Spacing
//
// Word spacing works the same way as character spacing, but applies
// only to the space character, code 32.
//
// Note: Word spacing is applied to every occurrence of the single-byte
// character code 32 in a string. This can occur when using a simple
// font or a composite font that defines code 32 as a single-byte code.
// It does not apply to occurrences of the byte value 32 in multiple-byte
// codes.
//
// RDD - My interpretation of this is that only character code 32's that
// encode to spaces should have word spacing applied. Cases have been
// observed where a font has a space character with a character code
// other than 32, and where word spacing (Tw) was used. In these cases,
// applying word spacing to either the non-32 space or to the character
// code 32 non-space resulted in errors consistent with this interpretation.
//
float spacingText = characterSpacingText;
if( (string[i] == 0x20) && codeLength == 1 )
{
spacingText += wordSpacingText;
}
/* The text matrix gets updated after each glyph is placed. The updated
* version will have the X and Y coordinates for the next glyph.
*/
// Snapshot of the glyph start position, taken before textMatrix is advanced.
Matrix glyphMatrixStDisp = textMatrix.multiply( dispMatrix );
//The adjustment will always be zero. The adjustment as shown in the
//TJ operator will be handled separately.
float adjustment=0;
// TODO : tx should be set for horizontal text and ty for vertical text
// which seems to be specified in the font (not the direction in the matrix).
float tx = ((characterHorizontalDisplacementText-adjustment/glyphSpaceToTextSpaceFactor)*fontSizeText)
* horizontalScalingText;
float ty = 0;
// First advance: move textMatrix by the glyph's own displacement only.
Matrix td = new Matrix();
td.setValue( 2, 0, tx );
td.setValue( 2, 1, ty );
textMatrix = td.multiply( textMatrix );
// Snapshot of the glyph end position; it deliberately precedes the spacing
// advance below, so the measured width excludes character/word spacing.
Matrix glyphMatrixEndDisp = textMatrix.multiply( dispMatrix );
// Second advance: move textMatrix by the character (and word) spacing.
float sx = spacingText * horizontalScalingText;
float sy = 0;
Matrix sd = new Matrix();
sd.setValue( 2, 0, sx );
sd.setValue( 2, 1, sy );
textMatrix = sd.multiply( textMatrix );
// determine the width of this character
// XXX: Note that if we handled vertical text, we should be using Y here
// NOTE(review): despite the "Text" suffix, this is the difference of two
// display-space X positions.
float widthText = glyphMatrixEndDisp.getXPosition() - glyphMatrixStDisp.getXPosition();
// Grow the width buffer (doubling) until the characters about to be appended fit.
while( characterBuffer.length() + ( c != null ? c.length() : 1 ) > individualWidthsBuffer.length )
{
float[] tmp = new float[individualWidthsBuffer.length * 2];
System.arraycopy( individualWidthsBuffer, 0, tmp, 0, individualWidthsBuffer.length );
individualWidthsBuffer = tmp;
}
//there are several cases where one character code will
//output multiple characters. For example "fi" or a
//glyphname that has no mapping like "visiblespace"
if( c != null )
{
// Split the glyph width evenly across every output character of this code.
Arrays.fill(
individualWidthsBuffer,
characterBuffer.length(),
characterBuffer.length() + c.length(),
widthText / c.length());
validCharCnt += c.length();
}
else
{
// PDFBOX-373: Replace a null entry with "?" so it is
// not printed as "(null)"
// The placeholder is counted in totalCharCnt below but not in validCharCnt.
c = "?";
individualWidthsBuffer[characterBuffer.length()] = widthText;
}
characterBuffer.append(c);
totalCharCnt += c.length();
// Keep accumulating into the current run while no spacing was applied after
// this glyph and the bound below still holds; otherwise flush the run.
// NOTE(review): because of the "string.length - 1" bound, a flush also happens
// when exactly one byte remains after this code; confirm this is intended.
if( spacingText == 0 && (i + codeLength) < (string.length - 1) )
{
continue;
}
// Flush: the run ends at the end of the current glyph (spacing excluded).
textMatrixEndDisp = glyphMatrixEndDisp;
float totalVerticalDisplacementDisp = maxVerticalDisplacementText * fontSizeText * yScaleDisp;
// Hand over an exact-length copy so the reusable buffer is not shared.
float[] individualWidths = new float[characterBuffer.length()];
System.arraycopy( individualWidthsBuffer, 0, individualWidths, 0, individualWidths.length );
// process the decoded text
processTextPosition(
new TextPosition(
page,
textMatrixStDisp,
textMatrixEndDisp,
totalVerticalDisplacementDisp,
individualWidths,
spaceWidthDisp,
characterBuffer.toString(),
font,
fontSizeText,
(int)(fontSizeText * textMatrix.getXScale()),
wordSpacingDisp ));
// The next run starts at the current text matrix, which already includes the spacing advance.
textMatrixStDisp = textMatrix.multiply( dispMatrix );
characterBuffer.setLength(0);
}
}
Process encoded text from the PDF Stream.
You should override this method if you want to perform an action when
encoded text is being processed.