Unicode字符串中的奇怪字符

时间:2012-03-02 21:32:56

标签: delphi mp3

在实现MP3 ID3 v2支持的过程中,我遇到了一些问题。除了下面这一个问题之外,大部分功能都已经可以正常工作,而这个问题可能与ID3本身根本无关。无论如何,我使用下面的代码来读取标签帧中涉及文本的数据。

我遇到的情况是(我猜?)某些字符串中含有Unicode字符。我尝试用下面的代码对其进行转换,转换本身是成功的。但是转换后的字符串前面多出了一个$3F,后面又多出了$3F $3F。我能否修改下面的代码来去掉这些多余的字符,还是必须自己手动处理?这些文件是由iTunes编码的,如果这一信息有帮助的话。

function Id3v2_string(currp: pointer; datasize: integer): string;
{ Handles string processing for ID3v2 data.

  currp    - pointer to the raw text bytes of the tag field
  datasize - number of bytes available at currp

  Returns the text as a string. When IsTextUnicode reports the buffer as
  Unicode the data is converted with WideCharToString; otherwise the raw
  bytes are copied into the result unchanged. }
  const
    IS_TEXT_UNICODE_UNICODE_MASK = $0F;
  var
    outstr: string;
    uscan: integer;
  begin
    outstr := '';
    // Pre-size the result; only the raw-copy branch below relies on this.
    SetLength(outstr, datasize);
    uscan := IS_TEXT_UNICODE_UNICODE_MASK;
    // NOTE(review): IsTextUnicode only guesses the encoding from the bytes;
    // a wrong guess sends the data down the wrong branch.
    if IsTextUnicode(currp, datasize, @uscan) then
      // NOTE(review): datasize is not passed to this call and any leading
      // byte-order mark is not skipped - presumably the conversion runs until
      // a terminating #0 is found, which may lie outside the field; confirm.
      outstr := WideCharToString(currp)
    else
      // NOTE(review): outstr[1] is indexed even when datasize is 0 and the
      // string is empty - verify callers never pass an empty field.
      move(currp^, outstr[1], datasize);
    Result := outstr;
  end;

注意,我真的对媒体库不感兴趣,因为我要做的就是编辑ID3标签而不是播放文件 - 除了像这样的一些小问题之外,实现已经完成。

1 个答案:

答案 0 :(得分:6)

根据所使用的ID3 v2版本,文本字符串可能以(也可能不以)一个编码字节开头,该字节用来说明字符串的实际编码。请勿使用IsTextUnicode()来猜测编码(尤其是因为它可能报告错误的结果)。

在ID3 v2直到v2.3的版本中,没有编码字节,文本要么是ISO-8859-1,要么是UCS-2;UCS-2字符串始终以BOM开头,因此您可以知道字节顺序。例如:

// prior to Delphi 2009 - String is Ansi
{ Decodes a text payload whose encoding is detected from a leading
  byte-order mark.

  currp    - pointer to the first byte of the text payload
  datasize - number of bytes available at currp

  If the payload starts with a UTF-16 BOM it is treated as UCS-2 (little- or
  big-endian according to the BOM); otherwise it is treated as ISO-8859-1.
  Returns the decoded text converted to the Ansi String type with trailing
  whitespace/control characters (including #0 terminators) removed.
  Returns '' for a nil pointer or a non-positive size. }
function Id3v2_string(currp: Pointer; datasize: Integer): String;
var
  W: WideString;
  I: Integer;
  Ch: WideChar;
  Bom: Word;
begin
  Result := '';
  if (currp = nil) or (datasize <= 0) then Exit;

  W := '';
  // Only read the BOM word when two bytes are actually available.
  Bom := 0;
  if datasize >= SizeOf(Word) then
    Bom := PWord(currp)^;

  if (Bom = $FEFF) or (Bom = $FFFE) then begin
    // UCS-2 with BOM.
    // Copy the 16-bit code units verbatim into the WideString. Going through
    // WideCharLenToString here would convert them to the Ansi code page
    // first, which destroys big-endian data before it can be byte-swapped.
    SetString(W, PWideChar(PAnsiChar(currp) + SizeOf(Word)),
      (datasize - SizeOf(Word)) div SizeOf(WideChar));
    if Bom = $FFFE then begin
      // BE, convert to LE by swapping the two bytes of every code unit
      for I := 1 to Length(W) do begin
        Ch := W[I];
        W[I] := WideChar(((Word(Ch) and $FF) shl 8) or (Word(Ch) shr 8));
      end;
    end;
  end else begin
    // ISO-8859-1 (Windows code page 28591): first call asks for the required
    // length, second call performs the conversion.
    I := MultiByteToWideChar(28591, 0, PAnsiChar(currp), datasize, nil, 0);
    if I > 0 then begin
      SetLength(W, I);
      MultiByteToWideChar(28591, 0, PAnsiChar(currp), datasize, PWideChar(W), I);
    end;
  end;
  // Assignment converts to Ansi; TrimRight drops trailing #0 / padding.
  Result := TrimRight(W);
end;

// Delphi 2009+ - String is Unicode
{ Decodes a text payload whose encoding is detected from a leading
  byte-order mark.

  currp    - pointer to the first byte of the text payload
  datasize - number of bytes available at currp

  If the payload starts with a UTF-16 BOM it is decoded as UCS-2 (little- or
  big-endian according to the BOM); otherwise it is decoded as ISO-8859-1.
  Trailing whitespace/control characters (including #0 terminators) are
  removed, matching the other Id3v2_string variants.
  Returns '' for a nil pointer or a non-positive size. }
function Id3v2_string(currp: Pointer; datasize: Integer): String;
var
  Enc: TEncoding;
  Bom: Word;

  // Copies Size bytes from P into a TBytes buffer and decodes them with Enc.
  function Convert(P: Pointer; Size: Integer): String;
  var
    Buf: TBytes;
  begin
    Result := '';
    if Size <= 0 then Exit;
    SetLength(Buf, Size);
    Move(P^, Buf[0], Size);
    Result := Enc.GetString(Buf);
  end;

begin
  Result := '';
  if (currp = nil) or (datasize <= 0) then Exit;

  // Only read the BOM word when two bytes are actually available.
  Bom := 0;
  if datasize >= SizeOf(Word) then
    Bom := PWord(currp)^;

  if (Bom = $FEFF) or (Bom = $FFFE) then begin
    // UCS-2 with BOM
    if Bom = $FFFE then begin
      // BE
      Enc := TEncoding.BigEndianUnicode;
    end else begin
      // LE
      Enc := TEncoding.Unicode;
    end;
    // Skip the BOM using byte-pointer arithmetic so the code does not depend
    // on {$POINTERMATH ON} being active for PWord.
    Result := Convert(PByte(currp) + SizeOf(Word), datasize - SizeOf(Word));
  end else begin
    // ISO-8859-1: GetEncoding returns a new instance that the caller owns,
    // unlike the shared TEncoding.Unicode/BigEndianUnicode singletons.
    Enc := TEncoding.GetEncoding(28591);
    try
      Result := Convert(currp, datasize);
    finally
      Enc.Free;
    end;
  end;
  // Drop trailing #0 terminators / padding that are part of the raw field.
  Result := TrimRight(Result);
end;

ID3 v2.4将UCS-2替换为UTF-16,并增加了对UTF-8以及不带BOM的UTF-16BE的支持,例如:

// prior to Delphi 2009 - String is Ansi
{ Decodes a text payload according to an explicit encoding byte.

  currp    - pointer to the first byte of the text payload
  datasize - number of bytes available at currp
  Encoding - $00 ISO-8859-1, $01 UTF-16 with BOM, $02 UTF-16BE without BOM,
             $03 UTF-8

  Returns the decoded text converted to the Ansi String type with trailing
  whitespace/control characters (including #0 terminators) removed.
  Returns '' for a nil pointer, a non-positive size, an unknown Encoding
  value, or a $01 payload too short to hold its BOM. }
function Id3v2_string(currp: Pointer; datasize: Integer; Encoding: Byte): String;
var
  W: WideString;

  // Swaps the two bytes of every UTF-16 code unit in W (BE <-> LE).
  procedure SwapByteOrder;
  var
    I: Integer;
    Ch: Word;
  begin
    for I := 1 to Length(W) do begin
      Ch := Word(W[I]);
      W[I] := WideChar(((Ch and $FF) shl 8) or (Ch shr 8));
    end;
  end;

  // Converts the whole payload from a Windows code page into W. The first
  // API call asks for the required length, the second does the conversion.
  procedure FromCodePage(CodePage: Cardinal);
  var
    Len: Integer;
  begin
    Len := MultiByteToWideChar(CodePage, 0, PAnsiChar(currp), datasize, nil, 0);
    if Len > 0 then begin
      SetLength(W, Len);
      MultiByteToWideChar(CodePage, 0, PAnsiChar(currp), datasize, PWideChar(W), Len);
    end;
  end;

begin
  Result := '';
  if (currp = nil) or (datasize <= 0) then Exit;

  W := '';
  case Encoding of
    $00: begin
      // ISO-8859-1
      FromCodePage(28591);
    end;
    $01: begin
      // UTF-16 with BOM; bail out if the BOM itself is not fully present
      // instead of reading past the end of the payload.
      if datasize < SizeOf(Word) then Exit;
      SetString(W, PWideChar(PAnsiChar(currp) + SizeOf(Word)),
        (datasize - SizeOf(Word)) div SizeOf(WideChar));
      if PWord(currp)^ = $FFFE then begin
        // BE, convert to LE
        SwapByteOrder;
      end;
    end;
    $02: begin
      // UTF-16BE without BOM, convert to LE
      SetString(W, PWideChar(currp), datasize div SizeOf(WideChar));
      SwapByteOrder;
    end;
    $03: begin
      // UTF-8
      FromCodePage(65001);
    end;
  end;
  // Assignment converts to Ansi; TrimRight drops trailing #0 / padding.
  Result := TrimRight(W);
end;

// Delphi 2009+ - String is Unicode
{ Decodes a text payload according to an explicit encoding byte.

  currp    - pointer to the first byte of the text payload
  datasize - number of bytes available at currp
  Encoding - $00 ISO-8859-1, $01 UTF-16 with BOM, $02 UTF-16BE without BOM,
             $03 UTF-8

  Returns the decoded text with trailing whitespace/control characters
  (including #0 terminators) removed.
  Returns '' for a nil pointer, a non-positive size, an unknown Encoding
  value, or a $01 payload too short to hold its BOM. }
function Id3v2_string(currp: Pointer; datasize: Integer; Encoding: Byte): String;
var
  Enc: TEncoding;

  // Copies Size bytes from P into a TBytes buffer and decodes them with Enc.
  function Convert(P: Pointer; Size: Integer): String;
  var
    Buf: TBytes;
  begin
    Result := '';
    if Size <= 0 then Exit;
    SetLength(Buf, Size);
    Move(P^, Buf[0], Size);
    Result := Enc.GetString(Buf);
  end;

begin
  Result := '';
  if (currp = nil) or (datasize <= 0) then Exit;

  case Encoding of
    $00: begin
      // ISO-8859-1: GetEncoding returns a new instance that the caller owns,
      // unlike the shared TEncoding singletons used in the other branches.
      Enc := TEncoding.GetEncoding(28591);
      try
        Result := Convert(currp, datasize);
      finally
        Enc.Free;
      end;
    end;
    $01: begin
      // UTF-16 with BOM; bail out if the BOM itself is not fully present
      // instead of reading past the payload and passing a negative size on.
      if datasize < SizeOf(Word) then Exit;
      if PWord(currp)^ = $FFFE then begin
        // BE
        Enc := TEncoding.BigEndianUnicode;
      end else begin
        // LE
        Enc := TEncoding.Unicode;
      end;
      // Skip the BOM using byte-pointer arithmetic so the code does not
      // depend on {$POINTERMATH ON} being active for PWord.
      Result := Convert(PByte(currp) + SizeOf(Word), datasize - SizeOf(Word));
    end;
    $02: begin
      // UTF-16BE without BOM
      Enc := TEncoding.BigEndianUnicode;
      Result := Convert(currp, datasize);
    end;
    $03: begin
      // UTF-8
      Enc := TEncoding.UTF8;
      Result := Convert(currp, datasize);
    end;
  end;
  // Drop trailing #0 terminators / padding that are part of the raw field.
  Result := TrimRight(Result);
end;
相关问题