erlang中的csv解析器

erlang中的csv解析器,csv,erlang,Csv,Erlang,对于我的应用程序,我必须使用Erlang解析CSV文件。以下是使用Erlang解析CSV的代码:- parse_file(Fn) -> {ok, Data} = file:read_file(Fn), parse(binary_to_list(Data)). parse(Data) -> lists:reverse(parse(Data, [])). parse([], Acc) -> Acc; parse(Data, Acc) -> {Line, Tail} = p

对于我的应用程序,我必须使用Erlang解析CSV文件。以下是使用Erlang解析CSV的代码:-

%% Reads the whole file Fn into memory and parses its contents as CSV.
%% Crashes with a badmatch if file:read_file/1 does not return {ok, _}.
parse_file(Fn) ->
    {ok, Bin} = file:read_file(Fn),
    Chars = binary_to_list(Bin),
    parse(Chars).

%% Parses CSV text (a list of character codes) into a list of lines in
%% input order. parse/2 collects the lines newest-first, so the result is
%% reversed here.
parse(Data) ->
    RevLines = parse(Data, []),
    lists:reverse(RevLines).

%% Consumes the input one line at a time, pushing each parsed line onto the
%% accumulator. The accumulator is returned as-is (newest line first) once
%% the input is exhausted.
parse([], RevLines) ->
    RevLines;
parse(Input, RevLines) ->
    {Fields, Remaining} = parse_line(Input),
    parse(Remaining, [Fields | RevLines]).

%% Parses a single line from the front of Data.
%% Returns {Fields, Rest}: the fields of that line in input order, and the
%% input left over after the line.
parse_line(Data) ->
    {RevFields, Rest} = parse_line(Data, []),
    Fields = lists:reverse(RevFields),
    {Fields, Rest}.

%% parse_line(Data, Acc) -> {RevFields, Rest}
%%
%% Scans one line, pushing each field onto Acc (newest first), and returns
%% the accumulated fields together with the input that follows the line
%% terminator (CRLF, LF, CR) or [] at end of input.
%%
%% Every comma separates two fields, so empty fields are kept wherever they
%% occur: "a,,,b" yields four fields, ",a" yields a leading empty field and
%% "a," yields a trailing empty field. Previously a ",," pair swallowed both
%% commas and leading/trailing commas produced no field at all.
parse_line([13,10|Data], Acc) -> {Acc, Data};
parse_line([10|Data], Acc) -> {Acc, Data};
parse_line([13|Data], Acc) -> {Acc, Data};
parse_line([], Acc) -> {Acc, []};
%% Acc is only empty before the first field, so a comma seen here means the
%% first field of the line is empty. Record it and re-dispatch the comma.
parse_line([$,|_] = Data, []) -> parse_line(Data, [""]);
%% A comma followed directly by another comma, a line terminator or the end
%% of input delimits an empty field. The following character is left in
%% place so it is handled by its own clause.
parse_line([$,,$,|Data], Acc) -> parse_line([$,|Data], [""|Acc]);
parse_line([$,,13|Data], Acc) -> parse_line([13|Data], [""|Acc]);
parse_line([$,,10|Data], Acc) -> parse_line([10|Data], [""|Acc]);
parse_line([$,], Acc) -> {[""|Acc], []};
%% Plain separator: the next field starts right after it.
parse_line([$,|Data], Acc) -> parse_line(Data, Acc);
parse_line(Data, Acc) ->
    {Fld, Tail} = parse_field(Data),
    parse_line(Tail, [Fld|Acc]).

%% Parses one field from the front of Data and returns {Field, Rest}.
%% A field that starts with a double quote is read by parse_fieldq/2 (the
%% opening quote is dropped); any other field is read by parse_field/2.
%% Both helpers accumulate characters in reverse, hence the final reverse.
parse_field(Data) ->
    {RevFld, Rest} =
        case Data of
            [$" | Quoted] -> parse_fieldq(Quoted, "");
            _ -> parse_field(Data, "")
        end,
    {lists:reverse(RevFld), Rest}.

%% Reads an unquoted field: collects characters (in reverse) until a comma,
%% CR, LF or the end of input. The delimiter is not consumed; it is returned
%% at the head of the remaining input.
parse_field([Ch | _] = Rest, RevFld) when Ch =:= $,; Ch =:= $\r; Ch =:= $\n ->
    {RevFld, Rest};
parse_field([], RevFld) ->
    {RevFld, []};
parse_field([Ch | Rest], RevFld) ->
    parse_field(Rest, [Ch | RevFld]).

%% Reads the body of a quoted field (the opening quote is already consumed).
%% A doubled quote stands for one literal quote; a single quote closes the
%% field. Returns {ReversedField, Rest} with Rest starting after the closing
%% quote. There is no clause for the end of input, so an unterminated quoted
%% field fails with function_clause.
parse_fieldq([$" | Rest], RevFld) ->
    case Rest of
        [$" | More] -> parse_fieldq(More, [$" | RevFld]);
        _ -> {RevFld, Rest}
    end;
parse_fieldq([Ch | Rest], RevFld) ->
    parse_fieldq(Rest, [Ch | RevFld]).
此代码基本工作正常,但存在两个问题:1. 代码依靠双引号(")和逗号(,)来解析并分隔每个值。但在下面的示例中,如果名字字段本身包含双引号,解析器就会多创建一个字段:

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------"All Pain Will End."","","itisashwani4u@gmail.com"

result:-
[["contact"],["Ashwani  Garg ------"],["All Pain Will End."],[],["itisashwani4u@gmail.com"]]

expected result:-
[["contact"],["Ashwani  Garg ------All Pain Will End."],[],["itisashwani4u@gmail.com"]]
2-对于以下类型的csv值,其截断某些值:- 姓名、姓氏、中间名、姓名、昵称、电子邮件地址、家乡街道、家乡城市、家乡邮政编码、家乡州、家乡国家/地区、家庭电话、家庭传真、手机、个人网页、商业街道、商业城市、商业邮政编码、商业州、商业国家/地区、商业网页、商业电话、商业传真、寻呼机、公司、职务、,部门、办公地点、备注

    Affection,,,Affection,,,,,,,,+919845141544,,+919845141544,,,,,,,,,,,,,,,
    result:-
    [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[]]
    expected result:-
   [["Affection"],[],[],["Affection"],[],[],[],[],[],[],[],["+919845141544"],[],["+919845141544"],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
请帮助我…如需参考,请使用以下链接:-

不使用 file:read_line 的版本:

%% Loads the entire file into memory and parses it line by line.
%% Crashes with a badmatch if file:read_file/1 does not return {ok, _}.
parse_file(File) ->
  {ok, Bin} = file:read_file(File),
  Chars = binary_to_list(Bin),
  parse(Chars, []).

%% parse(Data, Done) -> [ParsedLine]
%%
%% Splits the first line off Data, parses it with parse_line/1 and pushes
%% the result onto Done; returns the parsed lines in input order once Data
%% is empty.
%%
%% The line-break pattern must try "\r\n" before "\r": regex alternation is
%% ordered, so with "\r|\n|\r\n" a CRLF was split at the CR alone and the
%% remaining LF produced an extra empty line after every CRLF-terminated
%% line.
parse([], Done) ->
  lists:reverse(Done);

parse(Data, Done) ->
  {Line, Rest} = case re:split(Data, "\r\n|\r|\n", [{return, list}, {parts, 2}]) of
                   [L,R] -> {L,R};
                   %% No line break left: the remainder is the last line.
                   [L]   -> {L,[]}
                 end,
  parse(Rest, [parse_line(Line)|Done]).

中还讨论了从文件中读取行。使其适应您的需要应该很简单:

次要问题:

您如何创建CSV输入?它似乎不是有效的CSV(尽管CSV没有特别严格的规范)

通常,要在CSV字段中使用双引号,它们需要作为一对双引号转义,因此您的示例如下:

"Type","First Name","Last Name","Email"
"Contact","Ashwani  Garg ------""All Pain Will End.""","","itisashwani4u@gmail.com"

这将很好地导入到open office电子表格中,而您的原始示例则不然。

另一种可能的解决方案。可以很容易地更改为延迟计算,因此不需要一次读取整个文件

%% Entry point: starts the scanner with an empty current field, an empty
%% current row and no completed rows.
parse(Input) ->
    parse(Input, [], [], []).

%% parse(Input, Cur, Row, Rows)
%%   Cur  - characters of the field being read, newest first
%%   Row  - finished fields of the current row, newest first
%%   Rows - finished rows, newest first
%% CR is delegated to parse_r/4, LF ends the row, a comma ends the field,
%% and a double quote at the very start of a field switches to parse_q/4.
%% At end of input the pending field/row is always emitted and every level
%% is reversed back into input order.
parse([$\r | Rest], Cur, Row, Rows) ->
    parse_r(Rest, Cur, Row, Rows);
parse([$\n | Rest], Cur, Row, Rows) ->
    parse(Rest, [], [], [[Cur | Row] | Rows]);
parse([$, | Rest], Cur, Row, Rows) ->
    parse(Rest, [], [Cur | Row], Rows);
parse([$" | Rest], [], Row, Rows) ->
    parse_q(Rest, [], Row, Rows);
parse([Ch | Rest], Cur, Row, Rows) ->
    parse(Rest, [Ch | Cur], Row, Rows);
parse([], Cur, Row, Rows) ->
    FinishRow =
        fun(RevRow) ->
            lists:reverse(lists:map(fun lists:reverse/1, RevRow))
        end,
    lists:reverse(lists:map(FinishRow, [[Cur | Row] | Rows])).

%% Called right after a CR. Only CRLF is accepted: the LF is handed back to
%% parse/4, which ends the row. A CR followed by anything else matches no
%% clause and fails with function_clause.
parse_r(Input = [$\n | _], Cur, Row, Rows) ->
    parse(Input, Cur, Row, Rows).

%% Inside a quoted field: every character is collected verbatim until a
%% double quote is seen, at which point parse_qq/4 decides whether it is an
%% escaped quote or the end of the field. There is no clause for the end of
%% input, so an unterminated quoted field fails with function_clause.
parse_q([Ch | Rest], Cur, Row, Rows) ->
    case Ch of
        $" -> parse_qq(Rest, Cur, Row, Rows);
        _ -> parse_q(Rest, [Ch | Cur], Row, Rows)
    end.

%% Called after a double quote was seen inside a quoted field.
%% A second quote means an escaped literal quote and the quoted field
%% continues. A comma, CR, LF or the end of input means the field is closed
%% and control returns to parse/4 with the delimiter still unconsumed.
%% Any other character matches no clause and fails with function_clause.
parse_qq([$" | Rest], Cur, Row, Rows) ->
    parse_q(Rest, [$" | Cur], Row, Rows);
parse_qq([$, | _] = Input, Cur, Row, Rows) ->
    parse(Input, Cur, Row, Rows);
parse_qq([$\r | _] = Input, Cur, Row, Rows) ->
    parse(Input, Cur, Row, Rows);
parse_qq([$\n | _] = Input, Cur, Row, Rows) ->
    parse(Input, Cur, Row, Rows);
parse_qq([] = Input, Cur, Row, Rows) ->
    parse(Input, Cur, Row, Rows).
parse(数据)->parse(数据,[],[],[])。
解析([$\r |数据]、字段、字段、行)->解析(数据、字段、字段、行);
解析([$\n|Data],字段,字段,行)->解析(数据,[],[],[[Field |字段]|行]);
解析([$,[Field],Fields,line)->解析(Data,[],[Field | Fields],line);
解析([$”;数据],],字段,行)->parse_q(数据,[],字段,行);
解析([C|Data],字段,字段,行)->解析(数据,[C|Field],字段,行);
解析([],字段,字段,行)->
列表:反向(
[清单:反向](
[列表:反向(F)| | F解析_q(数据,[$”|字段]、字段、行);
解析qq([C | |]=数据、字段、字段、行)
当C==$,;C==$\r;C==$\n->解析(数据、字段、字段、行)时;
解析([],字段,字段,行)->解析([],字段,字段,行)。
我的实现:

-module(csv).

-export([
    parse/1
]).

%% Reads File and parses it as CSV.
%% Returns {ok, Records} on success. Any exception raised while reading or
%% parsing is caught and returned as {Class, Reason}.
parse(File) ->
    try
        {ok, Bin} = file:read_file(File),
        Chars = binary_to_list(Bin),
        Records = parse(Chars, [], [], []),
        {ok, Records}
    catch
        Class:Error ->
            {Class, Error}
    end.

%% parse(Input, FBuff, RBuff, Result) -> [Record]
%%   Input  - remaining characters
%%   FBuff  - characters of the current unquoted field, newest first
%%   RBuff  - finished fields of the current record, newest first
%%   Result - finished records, newest first
%%
%% Fixes over the previous version:
%%   * the field pending at a line break is now part of the record (the last
%%     field of every line used to be dropped);
%%   * a quoted field may be the last one on a line or in the input;
%%   * a final record without a trailing newline is no longer discarded.
%% A blank line still yields an empty record [].
%% Limitation: a doubled quote ("") inside a quoted field is not unescaped.
parse([], [], [], Result) ->
    lists:reverse(Result);
parse([], FBuff, RBuff, Result) ->
    %% Input ended in the middle of a record: flush it.
    lists:reverse([close_record(FBuff, RBuff) | Result]);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse_after_q(Rest1, [F | RBuff], Result);
parse([$,, $\s | Rest], FBuff, RBuff, Result) ->
    %% A single space right after a separator is skipped.
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [close_record(FBuff, RBuff) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [close_record(FBuff, RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

%% Builds the finished record from the pending field and the fields read so
%% far. Nothing pending at all means a blank line, reported as [].
close_record([], []) ->
    [];
close_record(FBuff, RBuff) ->
    lists:reverse([lists:reverse(FBuff) | RBuff]).

%% Continues after a quoted field has been pushed onto RBuff. parse_q/2 only
%% returns in front of a comma, a line break or the end of input, so these
%% clauses are exhaustive. No empty field is added here: the quoted field
%% itself was the last field before the delimiter.
parse_after_q([$, | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
parse_after_q([$\r, $\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse_after_q([$\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse_after_q([], RBuff, Result) ->
    lists:reverse([lists:reverse(RBuff) | Result]).

%% parse_q(Input, Acc) -> {Field, Rest}
%% Reads a quoted field whose opening quote is already consumed. The field
%% ends at a double quote that is followed by a comma, LF, CRLF or the end
%% of input; any other quote is kept as a literal character. Rest starts at
%% the delimiter after the closing quote. Running out of input inside the
%% quotes fails with function_clause.
parse_q([$"], Acc) ->
    {lists:reverse(Acc), []};
parse_q([$" | [$, | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([$" | [$\r, $\n | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([$" | [$\n | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([A | Rest], Acc) ->
    parse_q(Rest, [A | Acc]).
但是,此解决方案无法处理嵌套引号

例如:


1,“你好,“世界”,她说:“这是“解决方案,不是吗?”,2000\r\n

前几天我遇到了您的实现,并开始玩弄它

我也给你做了一个解析器

-module(csv_parser).

-export([parse_file/1]).

%% Reads File as a binary and hands it to parse/1.
%% Crashes with a badmatch if file:read_file/1 does not return {ok, _}.
parse_file(File) ->
  {ok, Bin} = file:read_file(File),
  parse(Bin).

%% Splits Data into lines, drops empty lines, splits each line on commas and
%% strips one pair of surrounding double quotes from each token.
%% Returns a list of rows, each a list of binaries.
%% Commas are split on before quotes are looked at, so a comma inside a
%% quoted field still separates tokens. A token that splits on quotes into
%% any number of parts other than one or three fails with case_clause.
parse(Data) ->
    Unquote =
        fun(Token) ->
            case re:split(Token, "\"", []) of
                [_, Inner, _] -> Inner;
                [Bare] -> Bare; % token is not surrounded by ""
                [] -> <<"">>
            end
        end,
    Rows = [Row || Row <- re:split(Data, "\r|\n|\r\n", []), Row =/= <<"">>],
    lists:map(
        fun(Row) -> lists:map(Unquote, re:split(Row, ",", [])) end,
        Rows
    ).
-模块(csv\u解析器)。
-导出([parse_file/1])。
解析_文件(文件)->
{ok,Data}=file:read_file(file),
解析(数据)。
解析(数据)->
Lines=re:split(数据“\r |\n |\r\n”,[]),
开始
案例re:拆分(标记“\”,[])的
[[uu,T,[uu]>T;
[T] ->T;%如果令牌没有被“”包围
[] -> 
结束

结束| | Token我对zed的答案添加了几个增强功能

-module (helper_csv_parser).
-compile(export_all).

% Taken from http://stackoverflow.com/questions/1532081/csv-parser-in-erlang, modified to fix errors.
%% Opens File (UTF-8) and parses it line by line via parse/4.
%% An empty file is now handled: the first read returning eof goes straight
%% to the eof clause of parse/4, which closes the file and returns [].
%% Previously this crashed with a badmatch and left the file open.
%% Any other result of the first read fails with case_clause.
parse(File) ->
    {ok, F} = file:open(File, [read, {encoding, utf8}]),
    case file:read_line(F) of
        {ok, L} -> parse(F, string:strip(L, right, $\n), [], 1);
        eof -> parse(F, eof, [], 1)
    end.

%% parse(F, Line, Done, Ctr)
%%   F    - open file handle
%%   Line - the line read on the previous step (or eof), not yet parsed
%%   Done - parsed lines so far, newest first
%%   Ctr  - line counter, used only in the error message
%% Reads one line ahead, so that the current Line is parsed and pushed once
%% it is known whether another line follows. Closes the file and returns the
%% parsed lines in input order at end of file.
parse(F, eof, Done, _) ->
    file:close(F),
    lists:reverse(Done);

parse(F, Line, Done, Ctr) ->
    Res = file:read_line(F),

    case Res of
        {error,collect_line} ->
            %% Release the handle before bailing out; it used to be leaked.
            file:close(F),
            %% helper:i2s/1 is defined outside this file - presumably an
            %% integer-to-string conversion; TODO confirm.
            throw({error, "Might be unicode at line " ++ helper:i2s(Ctr)});
        {ok, L} -> parse(F, string:strip(L, right, $\n),[parse_line(Line)|Done], Ctr+1);
        eof -> parse(F,eof,[parse_line(Line)|Done], Ctr+1)
    end.

%% Parses one line (without its line terminator) into a list of fields.
%% A leading comma means the first field is empty. After consuming that
%% comma the next field is parsed directly with parse_field/2. Going through
%% parse_line/2 here would consume a second comma as a separator, so ",,a"
%% gave two fields instead of three and "," gave one instead of two.
parse_line("," ++ Line) -> parse_field(Line, [[]]);
parse_line(Line) -> parse_line(Line, []).

%% Continues a line after a field has been completed. Fields holds the
%% fields so far, newest first. At end of line they are returned in input
%% order; otherwise an optional separating comma is skipped and the next
%% field is parsed.
parse_line([], RevFields) ->
    lists:reverse(RevFields);
parse_line([$, | Rest], RevFields) ->
    parse_field(Rest, RevFields);
parse_line(Rest, RevFields) ->
    parse_field(Rest, RevFields).

%% Starts a new field with an empty buffer. A field opening with a double
%% quote is read as quoted (the quote itself is dropped); anything else is
%% read as a plain field.
parse_field([$" | Rest], RevFields) ->
    parse_field_q(Rest, [], RevFields);
parse_field(Rest, RevFields) ->
    parse_field(Rest, [], RevFields).

%% parse_field(Line, Buf, Fields)
%% Reads an unquoted field into Buf (newest character first) until a comma
%% or the end of the line, then pushes the field onto Fields and continues
%% with parse_line/2. The comma is left in place for parse_line/2.
%% Surrounding spaces are stripped from every field. The end-of-line clause
%% used to skip the strip, so only the last field of a line kept its
%% padding.
parse_field("," ++ _ = Line, Buf, Fields) -> parse_line(Line, [string:strip(lists:reverse(Buf))|Fields]);
parse_field([C|Line], Buf, Fields) -> parse_field(Line, [C|Buf], Fields);
parse_field([], Buf, Fields) -> parse_line([], [string:strip(lists:reverse(Buf))|Fields]).

%% Convenience wrapper: reads a quoted field starting from an empty buffer.
parse_field_q(Rest, RevFields) ->
    parse_field_q(Rest, [], RevFields).
%% Reads the body of a quoted field into Buf (newest character first).
%% A doubled quote is stored as one literal quote. A single quote closes the
%% field, which is space-stripped, pushed onto Fields and followed by
%% parse_line/2. There is no clause for the end of the line, so a quoted
%% field that is not closed on the same line fails with function_clause.
parse_field_q([$", $" | Rest], Buf, RevFields) ->
    parse_field_q(Rest, [$" | Buf], RevFields);
parse_field_q([$" | Rest], Buf, RevFields) ->
    Field = string:strip(lists:reverse(Buf)),
    parse_line(Rest, [Field | RevFields]);
parse_field_q([Ch | Rest], Buf, RevFields) ->
    parse_field_q(Rest, [Ch | Buf], RevFields).

Wisher的答案不错,只是它丢失了每一行csv的最后一个元素。这里有一个修复方法。不过它仍然不处理嵌入的引号

-module(csv).

-export([read/1]).

%% Reads File and parses it as CSV.
%% Returns {ok, Records} on success. Any exception raised while reading or
%% parsing is caught and returned as {Class, Reason}.
read(File) ->
    try
        {ok, Bin} = file:read_file(File),
        Chars = binary_to_list(Bin),
        Records = parse(Chars, [], [], []),
        {ok, Records}
    catch
        Class:Error ->
            {Class, Error}
    end.

%% parse(Input, FBuff, RBuff, Result) -> [Record]
%%   Input  - remaining characters
%%   FBuff  - characters of the current unquoted field, newest first
%%   RBuff  - finished fields of the current record, newest first
%%   Result - finished records, newest first
%%
%% Fixes over the previous version:
%%   * a quoted field may be the last one on a line or in the input (it used
%%     to run on into the following line until the next `",`);
%%   * a final record without a trailing newline is no longer discarded.
%% A blank line still yields a record holding one empty field.
%% Limitation: a doubled quote ("") inside a quoted field is not unescaped.
parse([], [], [], Result) ->
    lists:reverse(Result);
parse([], FBuff, RBuff, Result) ->
    %% Input ended in the middle of a record: flush it.
    lists:reverse([close_record(FBuff, RBuff) | Result]);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse_after_q(Rest1, [F | RBuff], Result);
parse([$,, $\s | Rest], FBuff, RBuff, Result) ->
    %% A single space right after a separator is skipped.
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [close_record(FBuff, RBuff) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [close_record(FBuff, RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

%% Builds the finished record from the pending field and the fields read so
%% far, in input order.
close_record(FBuff, RBuff) ->
    lists:reverse([lists:reverse(FBuff) | RBuff]).

%% Continues after a quoted field has been pushed onto RBuff. parse_q/2 only
%% returns in front of a comma, a line break or the end of input, so these
%% clauses are exhaustive. No empty field is added here: the quoted field
%% itself was the last field before the delimiter.
parse_after_q([$, | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
parse_after_q([$\r, $\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse_after_q([$\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse_after_q([], RBuff, Result) ->
    lists:reverse([lists:reverse(RBuff) | Result]).

%% parse_q(Input, Acc) -> {Field, Rest}
%% Reads a quoted field whose opening quote is already consumed. The field
%% ends at a double quote that is followed by a comma, LF, CRLF or the end
%% of input; any other quote is kept as a literal character. Rest starts at
%% the delimiter after the closing quote. Running out of input inside the
%% quotes fails with function_clause.
parse_q([$"], Acc) ->
    {lists:reverse(Acc), []};
parse_q([$" | [$, | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([$" | [$\r, $\n | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([$" | [$\n | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([A | Rest], Acc) ->
    parse_q(Rest, [A | Acc]).

谢谢你的回复。我已经运行了你的脚本,它运行得很好,也可以排序,但请检查一下输出:[[“类型”、“名字”、“姓氏”、“电子邮件”]、[[]]、[“联系人”、“空值”、[],”ashishmichael@gmail.com“],[[]]。可以看到,每条记录之后都有一个没有任何用处的 [[]],我们能否避免它…… 运行时报错:** exception error: undefined function file:read_line/1 in function csv_erl_parser:parse/1。 好的,我添加了代码,先将整个文件读取到内存中,然后从那里开始处理;解析行的部分与以前一样。 老实说,我不明白你的意思……你的文件中是有两个连续的换行符,还是别的什么情况? Filtered = [E || E …(此处代码在摘录中被截断) 第一个URL似乎与问题末尾的URL相同 ;) 修复了错误:脚本在嵌套引号上失败。现在标记是一行没有引号。
-module(csv).

-export([read/1]).

%% Reads File and parses it as CSV.
%% Returns {ok, Records} on success. Any exception raised while reading or
%% parsing is caught and returned as {Class, Reason}.
read(File) ->
    try
        {ok, Bin} = file:read_file(File),
        Chars = binary_to_list(Bin),
        Records = parse(Chars, [], [], []),
        {ok, Records}
    catch
        Class:Error ->
            {Class, Error}
    end.

%% parse(Input, FBuff, RBuff, Result) -> [Record]
%%   Input  - remaining characters
%%   FBuff  - characters of the current unquoted field, newest first
%%   RBuff  - finished fields of the current record, newest first
%%   Result - finished records, newest first
%%
%% Fixes over the previous version:
%%   * a quoted field may be the last one on a line or in the input (it used
%%     to run on into the following line until the next `",`);
%%   * a final record without a trailing newline is no longer discarded.
%% A blank line still yields a record holding one empty field.
%% Limitation: a doubled quote ("") inside a quoted field is not unescaped.
parse([], [], [], Result) ->
    lists:reverse(Result);
parse([], FBuff, RBuff, Result) ->
    %% Input ended in the middle of a record: flush it.
    lists:reverse([close_record(FBuff, RBuff) | Result]);
parse([$" | Rest], _FBuff, RBuff, Result) ->
    {F, Rest1} = parse_q(Rest, []),
    parse_after_q(Rest1, [F | RBuff], Result);
parse([$,, $\s | Rest], FBuff, RBuff, Result) ->
    %% A single space right after a separator is skipped.
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$, | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [lists:reverse(FBuff) | RBuff], Result);
parse([$\r, $\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [close_record(FBuff, RBuff) | Result]);
parse([$\n | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [], [], [close_record(FBuff, RBuff) | Result]);
parse([A | Rest], FBuff, RBuff, Result) ->
    parse(Rest, [A | FBuff], RBuff, Result).

%% Builds the finished record from the pending field and the fields read so
%% far, in input order.
close_record(FBuff, RBuff) ->
    lists:reverse([lists:reverse(FBuff) | RBuff]).

%% Continues after a quoted field has been pushed onto RBuff. parse_q/2 only
%% returns in front of a comma, a line break or the end of input, so these
%% clauses are exhaustive. No empty field is added here: the quoted field
%% itself was the last field before the delimiter.
parse_after_q([$, | Rest], RBuff, Result) ->
    parse(Rest, [], RBuff, Result);
parse_after_q([$\r, $\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse_after_q([$\n | Rest], RBuff, Result) ->
    parse(Rest, [], [], [lists:reverse(RBuff) | Result]);
parse_after_q([], RBuff, Result) ->
    lists:reverse([lists:reverse(RBuff) | Result]).

%% parse_q(Input, Acc) -> {Field, Rest}
%% Reads a quoted field whose opening quote is already consumed. The field
%% ends at a double quote that is followed by a comma, LF, CRLF or the end
%% of input; any other quote is kept as a literal character. Rest starts at
%% the delimiter after the closing quote. Running out of input inside the
%% quotes fails with function_clause.
parse_q([$"], Acc) ->
    {lists:reverse(Acc), []};
parse_q([$" | [$, | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([$" | [$\r, $\n | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([$" | [$\n | _] = Rest], Acc) ->
    {lists:reverse(Acc), Rest};
parse_q([A | Rest], Acc) ->
    parse_q(Rest, [A | Acc]).